[llvm] [AMDGPU] Legalize 64bit elements for BUILD_VECTOR on gfx942 (PR #145052)

Janek van Oirschot via llvm-commits llvm-commits at lists.llvm.org
Wed Jul 2 07:37:10 PDT 2025


https://github.com/JanekvO updated https://github.com/llvm/llvm-project/pull/145052

>From 19adcef64394d7c9273e45d8aabac93c3080405c Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <Janek.vanOirschot at amd.com>
Date: Fri, 20 Jun 2025 03:24:29 -0700
Subject: [PATCH 1/6] [AMDGPU] Legalize 64bit elements for BUILD_VECTOR on
 gfx942

---
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp |   22 +-
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |    8 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  114 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.h       |    1 +
 llvm/test/CodeGen/AMDGPU/bf16-conversions.ll  |   16 +-
 .../AMDGPU/buffer-atomic-fadd.f32-no-rtn.ll   |  239 +-
 .../AMDGPU/buffer-atomic-fadd.f32-rtn.ll      |  251 +-
 .../CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll  |  832 +-
 .../AMDGPU/buffer-atomic-fadd.v2f16-no-rtn.ll |  239 +-
 .../AMDGPU/buffer-atomic-fadd.v2f16-rtn.ll    |  251 +-
 llvm/test/CodeGen/AMDGPU/flat-scratch.ll      |   48 +-
 llvm/test/CodeGen/AMDGPU/fmaximum3.ll         |    2 +-
 llvm/test/CodeGen/AMDGPU/fminimum3.ll         |    2 +-
 llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll        |   16 +-
 llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll  |  590 +-
 llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll  |  590 +-
 .../CodeGen/AMDGPU/masked-load-vectortypes.ll |   29 +-
 llvm/test/CodeGen/AMDGPU/maximumnum.ll        | 1461 ++-
 llvm/test/CodeGen/AMDGPU/minimumnum.ll        | 1461 ++-
 llvm/test/CodeGen/AMDGPU/packed-fp32.ll       |  429 +-
 llvm/test/CodeGen/AMDGPU/preload-kernargs.ll  |    8 +-
 .../AMDGPU/shufflevector.v2i64.v2i64.ll       |  234 +-
 .../AMDGPU/shufflevector.v2i64.v3i64.ll       |  575 +-
 .../AMDGPU/shufflevector.v2i64.v4i64.ll       | 1273 ++-
 .../AMDGPU/shufflevector.v2i64.v8i64.ll       | 4339 +++++----
 .../CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll |  234 +-
 .../CodeGen/AMDGPU/shufflevector.v2p0.v3p0.ll |  575 +-
 .../CodeGen/AMDGPU/shufflevector.v2p0.v4p0.ll | 1273 ++-
 .../AMDGPU/shufflevector.v3i64.v2i64.ll       |  685 +-
 .../AMDGPU/shufflevector.v3i64.v3i64.ll       | 1842 ++--
 .../AMDGPU/shufflevector.v3i64.v4i64.ll       | 5059 +++++-----
 .../CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll |  685 +-
 .../CodeGen/AMDGPU/shufflevector.v3p0.v3p0.ll | 1842 ++--
 .../CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll | 5059 +++++-----
 .../AMDGPU/shufflevector.v4i64.v2i64.ll       | 2801 ++++--
 .../AMDGPU/shufflevector.v4i64.v3i64.ll       | 2389 +++--
 .../AMDGPU/shufflevector.v4i64.v4i64.ll       | 8424 +++++++++--------
 .../CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll | 2801 ++++--
 .../CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll | 2389 +++--
 .../CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll | 8424 +++++++++--------
 llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll   |   25 +-
 .../test/CodeGen/AMDGPU/vector-reduce-fmax.ll |    7 +-
 .../test/CodeGen/AMDGPU/vector-reduce-fmin.ll |    5 +-
 .../test/CodeGen/AMDGPU/vni8-across-blocks.ll |    5 +-
 44 files changed, 33877 insertions(+), 23677 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 6e990cb2e160c..044c073e7f918 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -440,6 +440,8 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
   EVT EltVT = VT.getVectorElementType();
   SDLoc DL(N);
   SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
+  unsigned NumRegs = EltVT.getSizeInBits() / 32;
+  bool IsGCN = CurDAG->getSubtarget().getTargetTriple().isAMDGCN();
 
   if (NumVectorElts == 1) {
     CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
@@ -449,12 +451,13 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
 
   assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
                                   "supported yet");
+  assert((IsGCN || (!IsGCN && NumRegs == 1)) &&
+         "R600 does not support 64-bit reg_seq elements");
   // 32 = Max Num Vector Elements
   // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
   // 1 = Vector Register Class
   SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
 
-  bool IsGCN = CurDAG->getSubtarget().getTargetTriple().isAMDGCN();
   RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
   bool IsRegSeq = true;
   unsigned NOps = N->getNumOperands();
@@ -464,8 +467,9 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
       IsRegSeq = false;
       break;
     }
-    unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
-                         : R600RegisterInfo::getSubRegFromChannel(i);
+    unsigned Sub =
+        IsGCN ? SIRegisterInfo::getSubRegFromChannel(i * NumRegs, NumRegs)
+              : R600RegisterInfo::getSubRegFromChannel(i);
     RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
     RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
   }
@@ -475,8 +479,9 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
     MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
                                                    DL, EltVT);
     for (unsigned i = NOps; i < NumVectorElts; ++i) {
-      unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
-                           : R600RegisterInfo::getSubRegFromChannel(i);
+      unsigned Sub =
+          IsGCN ? SIRegisterInfo::getSubRegFromChannel(i * NumRegs, NumRegs)
+                : R600RegisterInfo::getSubRegFromChannel(i);
       RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
       RegSeqArgs[1 + (2 * i) + 1] =
           CurDAG->getTargetConstant(Sub, DL, MVT::i32);
@@ -644,9 +649,12 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
       break;
     }
 
-    assert(VT.getVectorElementType().bitsEq(MVT::i32));
+    EVT VET = VT.getVectorElementType();
+    assert(VET.bitsEq(MVT::i32) || VET.bitsEq(MVT::i64));
+    unsigned EltSize = VET.getSizeInBits();
     unsigned RegClassID =
-        SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
+        SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * EltSize)
+            ->getID();
     SelectBuildVector(N, RegClassID);
     return;
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index d75c7a178b4a8..8b69808db874d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5270,6 +5270,14 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::BITCAST: {
     EVT DestVT = N->getValueType(0);
 
+    // Avoid undoing build_vector with 64b elements if subtarget supports 64b
+    // movs (i.e., avoid inf loop through combines).
+    if (Subtarget->isGCN()) {
+      const GCNSubtarget &ST = DAG.getSubtarget<GCNSubtarget>();
+      if (ST.hasMovB64())
+        break;
+    }
+
     // Push casts through vector builds. This helps avoid emitting a large
     // number of copies when materializing floating point vector constants.
     //
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index bb1de58e04fbc..6390bc79427df 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -357,9 +357,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   // Most operations are naturally 32-bit vector operations. We only support
   // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
   for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
-    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
-    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
-
+    if (STI.hasMovB64())
+      setOperationAction(ISD::BUILD_VECTOR, Vec64, Legal);
+    else {
+      setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
+      AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
+    }
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
     AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
 
@@ -371,9 +374,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   }
 
   for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
-    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
-    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
-
+    if (STI.hasMovB64())
+      setOperationAction(ISD::BUILD_VECTOR, Vec64, Legal);
+    else {
+      setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
+      AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
+    }
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
     AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
 
@@ -385,9 +391,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   }
 
   for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
-    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
-    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
-
+    if (STI.hasMovB64())
+      setOperationAction(ISD::BUILD_VECTOR, Vec64, Legal);
+    else {
+      setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
+      AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
+    }
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
     AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
 
@@ -399,9 +408,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   }
 
   for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
-    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
-    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
-
+    if (STI.hasMovB64())
+      setOperationAction(ISD::BUILD_VECTOR, Vec64, Legal);
+    else {
+      setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
+      AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
+    }
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
     AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
 
@@ -413,9 +425,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   }
 
   for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
-    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
-    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
-
+    if (STI.hasMovB64())
+      setOperationAction(ISD::BUILD_VECTOR, Vec64, Legal);
+    else {
+      setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
+      AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
+    }
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
     AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
 
@@ -946,6 +961,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 
   setTargetDAGCombine({ISD::ADD,
                        ISD::PTRADD,
+                       ISD::BUILD_VECTOR,
                        ISD::UADDO_CARRY,
                        ISD::SUB,
                        ISD::USUBO_CARRY,
@@ -15647,6 +15663,72 @@ SDValue SITargetLowering::performClampCombine(SDNode *N,
   return SDValue(CSrc, 0);
 }
 
+SDValue
+SITargetLowering::performBuildVectorCombine(SDNode *N,
+                                            DAGCombinerInfo &DCI) const {
+  const GCNSubtarget *ST = getSubtarget();
+  if (DCI.Level < AfterLegalizeDAG || !ST->hasMovB64())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc SL(N);
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N);
+
+  EVT VT = N->getValueType(0);
+  EVT EltVT = VT.getVectorElementType();
+  unsigned SizeBits = VT.getSizeInBits();
+  unsigned EltSize = EltVT.getSizeInBits();
+
+  // Skip if:
+  //  - Value type isn't multiplication of 64 bit (e.g., v3i32), or
+  //  - BuildVector instruction has non-constants, or
+  //  - Element type has already been combined into i64 elements
+  if ((SizeBits % 64) != 0 || !BVN->isConstant() || EltVT == MVT::i64)
+    return SDValue();
+
+  // Construct the 64b values.
+  SmallVector<uint64_t, 8> ImmVals;
+  uint64_t ImmVal = 0;
+  uint64_t ImmSize = 0;
+  for (SDValue Opand : N->ops()) {
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Opand);
+    if (!C)
+      return SDValue();
+
+    ImmVal |= C->getZExtValue() << ImmSize;
+    ImmSize += EltSize;
+    if (ImmSize > 64)
+      return SDValue();
+    if (ImmSize == 64) {
+      if (!isUInt<32>(ImmVal))
+        return SDValue();
+      ImmVals.push_back(ImmVal);
+      ImmVal = 0;
+      ImmSize = 0;
+    }
+  }
+
+  // Avoid emitting build_vector with 1 element and directly emit value.
+  if (ImmVals.size() == 1) {
+    SDValue Val = DAG.getConstant(ImmVals[0], SL, MVT::i64);
+    return DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Val);
+  }
+
+  // Construct and return build_vector with 64b elements.
+  if (!ImmVals.empty()) {
+    SmallVector<SDValue, 8> VectorConsts;
+    for (uint64_t I : ImmVals)
+      VectorConsts.push_back(DAG.getConstant(I, SL, MVT::i64));
+    unsigned NewNumElts = SizeBits / 64;
+    LLVMContext &Ctx = *DAG.getContext();
+    EVT NewVT = EVT::getVectorVT(Ctx, MVT::i64, NewNumElts);
+    SDValue BV = DAG.getBuildVector(
+        NewVT, SL, ArrayRef(VectorConsts.begin(), VectorConsts.end()));
+    return DAG.getBitcast(VT, BV);
+  }
+  return SDValue();
+}
+
 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
   switch (N->getOpcode()) {
@@ -15736,6 +15818,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
     return performFCanonicalizeCombine(N, DCI);
   case AMDGPUISD::RCP:
     return performRcpCombine(N, DCI);
+  case ISD::BUILD_VECTOR:
+    return performBuildVectorCombine(N, DCI);
   case ISD::FLDEXP:
   case AMDGPUISD::FRACT:
   case AMDGPUISD::RSQ:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index c66f300ec4cb1..38d60610ab25a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -232,6 +232,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue performCvtF32UByteNCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performBuildVectorCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
   bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;
 
diff --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
index 5b4866c386793..adfc8545780c6 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
@@ -152,24 +152,24 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) {
 ; GFX-950:       ; %bb.0:
 ; GFX-950-NEXT:    v_cvt_f32_f64_e32 v6, v[2:3]
 ; GFX-950-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
-; GFX-950-NEXT:    v_and_b32_e32 v7, 1, v6
+; GFX-950-NEXT:    v_and_b32_e32 v8, 1, v6
 ; GFX-950-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[2:3]|, |v[4:5]|
 ; GFX-950-NEXT:    v_cmp_nlg_f64_e32 vcc, v[2:3], v[4:5]
-; GFX-950-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v7
+; GFX-950-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v8
 ; GFX-950-NEXT:    v_cndmask_b32_e64 v2, -1, 1, s[2:3]
+; GFX-950-NEXT:    v_cvt_f32_f64_e32 v7, v[0:1]
 ; GFX-950-NEXT:    v_add_u32_e32 v2, v6, v2
 ; GFX-950-NEXT:    s_or_b64 vcc, s[0:1], vcc
-; GFX-950-NEXT:    v_cvt_f32_f64_e32 v5, v[0:1]
 ; GFX-950-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
-; GFX-950-NEXT:    v_cvt_f64_f32_e32 v[2:3], v5
-; GFX-950-NEXT:    v_and_b32_e32 v6, 1, v5
+; GFX-950-NEXT:    v_cvt_f64_f32_e32 v[2:3], v7
+; GFX-950-NEXT:    v_and_b32_e32 v5, 1, v7
 ; GFX-950-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[2:3]|
 ; GFX-950-NEXT:    v_cmp_nlg_f64_e32 vcc, v[0:1], v[2:3]
-; GFX-950-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v6
+; GFX-950-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v5
 ; GFX-950-NEXT:    v_cndmask_b32_e64 v0, -1, 1, s[2:3]
-; GFX-950-NEXT:    v_add_u32_e32 v0, v5, v0
+; GFX-950-NEXT:    v_add_u32_e32 v0, v7, v0
 ; GFX-950-NEXT:    s_or_b64 vcc, s[0:1], vcc
-; GFX-950-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GFX-950-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
 ; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, v4
 ; GFX-950-NEXT:    ; return to shader part epilog
   %res = fptrunc <2 x double> %src to <2 x bfloat>
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-no-rtn.ll
index a14114358433a..9b783ed9d7772 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-no-rtn.ll
@@ -1,11 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908_GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX942 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908_GFX11 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908_GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX942 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908_GFX11 %s
 
 define amdgpu_ps void @buffer_atomic_fadd_f32_offset_no_rtn(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) {
@@ -167,25 +167,41 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f32_offset_no_rtn(float %val, ptr
   ; GFX908_GFX11-NEXT:   BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
   ; GFX908_GFX11-NEXT:   S_ENDPGM 0
   ;
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_no_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_no_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3
+  ; GFX90A-NEXT:   BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_no_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   S_ENDPGM 0
   %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0)
   ret void
 }
@@ -212,26 +228,43 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f32_offen_no_rtn(float %val, ptr a
   ; GFX908_GFX11-NEXT:   BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
   ; GFX908_GFX11-NEXT:   S_ENDPGM 0
   ;
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_offen_no_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_f32_offen_no_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
+  ; GFX90A-NEXT:   BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_offen_no_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   S_ENDPGM 0
   %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret void
 }
@@ -258,26 +291,43 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f32_idxen_no_rtn(float %val, ptr a
   ; GFX908_GFX11-NEXT:   BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
   ; GFX908_GFX11-NEXT:   S_ENDPGM 0
   ;
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_idxen_no_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_f32_idxen_no_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
+  ; GFX90A-NEXT:   BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_idxen_no_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   S_ENDPGM 0
   %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
   ret void
 }
@@ -306,28 +356,47 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f32_bothen_no_rtn(float %val, ptr
   ; GFX908_GFX11-NEXT:   BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
   ; GFX908_GFX11-NEXT:   S_ENDPGM 0
   ;
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_bothen_no_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_f32_bothen_no_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+  ; GFX90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX90A-NEXT:   BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_bothen_no_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX942-NEXT:   BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   S_ENDPGM 0
   %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll
index eb452dc4b874f..c22360b861a21 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX942 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX942 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX11 %s
 
 define amdgpu_ps float @buffer_atomic_fadd_f32_offset_rtn(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) {
@@ -153,26 +153,43 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_bothen_rtn(float %val, <4 x i32>
 }
 
 define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_offset_rtn(float %val, ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) {
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]]
-  ; GFX90A_GFX942-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3
+  ; GFX90A-NEXT:   [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]]
+  ; GFX90A-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]]
+  ; GFX942-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
   ;
   ; GFX11-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_rtn
   ; GFX11: bb.0 (%ir-block.0):
@@ -199,27 +216,45 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_offset_rtn(float %val, ptr ad
 }
 
 define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_offen_rtn(float %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_offen_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]]
-  ; GFX90A_GFX942-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_f32_offen_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
+  ; GFX90A-NEXT:   [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]]
+  ; GFX90A-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_offen_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]]
+  ; GFX942-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
   ;
   ; GFX11-LABEL: name: buffer_ptr_atomic_fadd_f32_offen_rtn
   ; GFX11: bb.0 (%ir-block.0):
@@ -247,27 +282,45 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_offen_rtn(float %val, ptr add
 }
 
 define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_idxen_rtn(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_idxen_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]]
-  ; GFX90A_GFX942-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_f32_idxen_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
+  ; GFX90A-NEXT:   [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]]
+  ; GFX90A-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_idxen_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]]
+  ; GFX942-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
   ;
   ; GFX11-LABEL: name: buffer_ptr_atomic_fadd_f32_idxen_rtn
   ; GFX11: bb.0 (%ir-block.0):
@@ -295,29 +348,49 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_idxen_rtn(float %val, ptr add
 }
 
 define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_bothen_rtn(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_bothen_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]]
-  ; GFX90A_GFX942-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_f32_bothen_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+  ; GFX90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX90A-NEXT:   [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]]
+  ; GFX90A-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_bothen_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX942-NEXT:   [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]]
+  ; GFX942-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
   ;
   ; GFX11-LABEL: name: buffer_ptr_atomic_fadd_f32_bothen_rtn
   ; GFX11: bb.0 (%ir-block.0):
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll
index 37928a78622a6..fe0e9fd1d7fe5 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFx90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX942 %s
 
 define amdgpu_ps void @buffer_atomic_fadd_f64_offset_no_rtn(double %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) {
   ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_f64_offset_no_rtn
@@ -207,251 +207,655 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_bothen_rtn(double %val, <4 x i32
 }
 
 define amdgpu_ps void @buffer_ptr_atomic_fadd_f64_offset_no_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) {
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_offset_no_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
-  ; GFX90A_GFX942-NEXT:   BUFFER_ATOMIC_ADD_F64_OFFSET killed [[COPY11]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_f64_offset_no_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
+  ; GFX90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFX90A-NEXT:   BUFFER_ATOMIC_ADD_F64_OFFSET killed [[COPY11]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_offset_no_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFX942-NEXT:   [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFX942-NEXT:   BUFFER_ATOMIC_ADD_F64_OFFSET killed [[COPY7]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFx90A-LABEL: name: buffer_ptr_atomic_fadd_f64_offset_no_rtn
+  ; GFx90A: bb.0 (%ir-block.0):
+  ; GFx90A-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+  ; GFx90A-NEXT: {{  $}}
+  ; GFx90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFx90A-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFx90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFx90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFx90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFx90A-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFx90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFx90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFx90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
+  ; GFx90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFx90A-NEXT:   BUFFER_ATOMIC_ADD_F64_OFFSET killed [[COPY11]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFx90A-NEXT:   S_ENDPGM 0
   %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0)
   ret void
 }
 
 define amdgpu_ps void @buffer_ptr_atomic_fadd_f64_offen_no_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_offen_no_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
-  ; GFX90A_GFX942-NEXT:   BUFFER_ATOMIC_ADD_F64_OFFEN killed [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_f64_offen_no_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+  ; GFX90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFX90A-NEXT:   BUFFER_ATOMIC_ADD_F64_OFFEN killed [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_offen_no_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
+  ; GFX942-NEXT:   [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFX942-NEXT:   BUFFER_ATOMIC_ADD_F64_OFFEN killed [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFx90A-LABEL: name: buffer_ptr_atomic_fadd_f64_offen_no_rtn
+  ; GFx90A: bb.0 (%ir-block.0):
+  ; GFx90A-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4
+  ; GFx90A-NEXT: {{  $}}
+  ; GFx90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFx90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFx90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFx90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFx90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFx90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFx90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFx90A-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFx90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFx90A-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+  ; GFx90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFx90A-NEXT:   BUFFER_ATOMIC_ADD_F64_OFFEN killed [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFx90A-NEXT:   S_ENDPGM 0
   %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret void
 }
 
 define amdgpu_ps void @buffer_ptr_atomic_fadd_f64_idxen_no_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_idxen_no_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
-  ; GFX90A_GFX942-NEXT:   BUFFER_ATOMIC_ADD_F64_IDXEN killed [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_f64_idxen_no_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+  ; GFX90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFX90A-NEXT:   BUFFER_ATOMIC_ADD_F64_IDXEN killed [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_idxen_no_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
+  ; GFX942-NEXT:   [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFX942-NEXT:   BUFFER_ATOMIC_ADD_F64_IDXEN killed [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFx90A-LABEL: name: buffer_ptr_atomic_fadd_f64_idxen_no_rtn
+  ; GFx90A: bb.0 (%ir-block.0):
+  ; GFx90A-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4
+  ; GFx90A-NEXT: {{  $}}
+  ; GFx90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFx90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFx90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFx90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFx90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFx90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFx90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFx90A-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFx90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFx90A-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+  ; GFx90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFx90A-NEXT:   BUFFER_ATOMIC_ADD_F64_IDXEN killed [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFx90A-NEXT:   S_ENDPGM 0
   %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
   ret void
 }
 
 define amdgpu_ps void @buffer_ptr_atomic_fadd_f64_bothen_no_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_bothen_no_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $vgpr3, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY12:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY12]], %subreg.sub0, killed [[COPY11]], %subreg.sub1, killed [[COPY10]], %subreg.sub2, killed [[COPY9]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
-  ; GFX90A_GFX942-NEXT:   BUFFER_ATOMIC_ADD_F64_BOTHEN killed [[COPY13]], killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_f64_bothen_no_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $vgpr3, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY12:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY12]], %subreg.sub0, killed [[COPY11]], %subreg.sub1, killed [[COPY10]], %subreg.sub2, killed [[COPY9]], %subreg.sub3
+  ; GFX90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
+  ; GFX90A-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFX90A-NEXT:   BUFFER_ATOMIC_ADD_F64_BOTHEN killed [[COPY13]], killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_bothen_no_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $vgpr3, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX942-NEXT:   [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFX942-NEXT:   BUFFER_ATOMIC_ADD_F64_BOTHEN killed [[COPY9]], killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFx90A-LABEL: name: buffer_ptr_atomic_fadd_f64_bothen_no_rtn
+  ; GFx90A: bb.0 (%ir-block.0):
+  ; GFx90A-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $vgpr3, $sgpr4
+  ; GFx90A-NEXT: {{  $}}
+  ; GFx90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFx90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GFx90A-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFx90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFx90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFx90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFx90A-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFx90A-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFx90A-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFx90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFx90A-NEXT:   [[COPY12:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY12]], %subreg.sub0, killed [[COPY11]], %subreg.sub1, killed [[COPY10]], %subreg.sub2, killed [[COPY9]], %subreg.sub3
+  ; GFx90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
+  ; GFx90A-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFx90A-NEXT:   BUFFER_ATOMIC_ADD_F64_BOTHEN killed [[COPY13]], killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFx90A-NEXT:   S_ENDPGM 0
   %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
   ret void
 }
 
 define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_offset_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) {
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_offset_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
-  ; GFX90A_GFX942-NEXT:   [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY11]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0
-  ; GFX90A_GFX942-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec
-  ; GFX90A_GFX942-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1
-  ; GFX90A_GFX942-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
-  ; GFX90A_GFX942-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
-  ; GFX90A_GFX942-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
-  ; GFX90A_GFX942-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_f64_offset_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
+  ; GFX90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFX90A-NEXT:   [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY11]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0
+  ; GFX90A-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec
+  ; GFX90A-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1
+  ; GFX90A-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
+  ; GFX90A-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFX90A-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+  ; GFX90A-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_offset_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFX942-NEXT:   [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFX942-NEXT:   [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY7]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0
+  ; GFX942-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY8]], implicit $exec
+  ; GFX942-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1
+  ; GFX942-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY9]], implicit $exec
+  ; GFX942-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFX942-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+  ; GFX942-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+  ;
+  ; GFx90A-LABEL: name: buffer_ptr_atomic_fadd_f64_offset_rtn
+  ; GFx90A: bb.0 (%ir-block.0):
+  ; GFx90A-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+  ; GFx90A-NEXT: {{  $}}
+  ; GFx90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFx90A-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFx90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFx90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFx90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFx90A-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFx90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFx90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFx90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
+  ; GFx90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFx90A-NEXT:   [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY11]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFx90A-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0
+  ; GFx90A-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec
+  ; GFx90A-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1
+  ; GFx90A-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
+  ; GFx90A-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFx90A-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+  ; GFx90A-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
   %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0)
   ret double %ret
 }
 
 define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_offen_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_offen_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
-  ; GFX90A_GFX942-NEXT:   [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0
-  ; GFX90A_GFX942-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
-  ; GFX90A_GFX942-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub1
-  ; GFX90A_GFX942-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
-  ; GFX90A_GFX942-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
-  ; GFX90A_GFX942-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
-  ; GFX90A_GFX942-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_f64_offen_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+  ; GFX90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFX90A-NEXT:   [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0
+  ; GFX90A-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
+  ; GFX90A-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub1
+  ; GFX90A-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
+  ; GFX90A-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFX90A-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+  ; GFX90A-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_offen_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
+  ; GFX942-NEXT:   [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFX942-NEXT:   [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0
+  ; GFX942-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY9]], implicit $exec
+  ; GFX942-NEXT:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub1
+  ; GFX942-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec
+  ; GFX942-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFX942-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+  ; GFX942-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+  ;
+  ; GFx90A-LABEL: name: buffer_ptr_atomic_fadd_f64_offen_rtn
+  ; GFx90A: bb.0 (%ir-block.0):
+  ; GFx90A-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4
+  ; GFx90A-NEXT: {{  $}}
+  ; GFx90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFx90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFx90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFx90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFx90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFx90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFx90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFx90A-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFx90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFx90A-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+  ; GFx90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFx90A-NEXT:   [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFx90A-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0
+  ; GFx90A-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
+  ; GFx90A-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub1
+  ; GFx90A-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
+  ; GFx90A-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFx90A-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+  ; GFx90A-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
   %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret double %ret
 }
 
 define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_idxen_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_idxen_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
-  ; GFX90A_GFX942-NEXT:   [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0
-  ; GFX90A_GFX942-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
-  ; GFX90A_GFX942-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub1
-  ; GFX90A_GFX942-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
-  ; GFX90A_GFX942-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
-  ; GFX90A_GFX942-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
-  ; GFX90A_GFX942-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_f64_idxen_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+  ; GFX90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFX90A-NEXT:   [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0
+  ; GFX90A-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
+  ; GFX90A-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub1
+  ; GFX90A-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
+  ; GFX90A-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFX90A-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+  ; GFX90A-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_idxen_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
+  ; GFX942-NEXT:   [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFX942-NEXT:   [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0
+  ; GFX942-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY9]], implicit $exec
+  ; GFX942-NEXT:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub1
+  ; GFX942-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec
+  ; GFX942-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFX942-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+  ; GFX942-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+  ;
+  ; GFx90A-LABEL: name: buffer_ptr_atomic_fadd_f64_idxen_rtn
+  ; GFx90A: bb.0 (%ir-block.0):
+  ; GFx90A-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4
+  ; GFx90A-NEXT: {{  $}}
+  ; GFx90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFx90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFx90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFx90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFx90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFx90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFx90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFx90A-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFx90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFx90A-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+  ; GFx90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFx90A-NEXT:   [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFx90A-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0
+  ; GFx90A-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
+  ; GFx90A-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub1
+  ; GFx90A-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
+  ; GFx90A-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFx90A-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+  ; GFx90A-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
   %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
   ret double %ret
 }
 
 define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_bothen_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_bothen_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $vgpr3, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY12:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY12]], %subreg.sub0, killed [[COPY11]], %subreg.sub1, killed [[COPY10]], %subreg.sub2, killed [[COPY9]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
-  ; GFX90A_GFX942-NEXT:   [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY13]], killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0
-  ; GFX90A_GFX942-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
-  ; GFX90A_GFX942-NEXT:   [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub1
-  ; GFX90A_GFX942-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY15]], implicit $exec
-  ; GFX90A_GFX942-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
-  ; GFX90A_GFX942-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
-  ; GFX90A_GFX942-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_f64_bothen_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $vgpr3, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY12:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY12]], %subreg.sub0, killed [[COPY11]], %subreg.sub1, killed [[COPY10]], %subreg.sub2, killed [[COPY9]], %subreg.sub3
+  ; GFX90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
+  ; GFX90A-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFX90A-NEXT:   [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY13]], killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0
+  ; GFX90A-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
+  ; GFX90A-NEXT:   [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub1
+  ; GFX90A-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY15]], implicit $exec
+  ; GFX90A-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFX90A-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+  ; GFX90A-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_bothen_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $vgpr3, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX942-NEXT:   [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFX942-NEXT:   [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY9]], killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0
+  ; GFX942-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec
+  ; GFX942-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub1
+  ; GFX942-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY11]], implicit $exec
+  ; GFX942-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFX942-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+  ; GFX942-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+  ;
+  ; GFx90A-LABEL: name: buffer_ptr_atomic_fadd_f64_bothen_rtn
+  ; GFx90A: bb.0 (%ir-block.0):
+  ; GFx90A-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $vgpr3, $sgpr4
+  ; GFx90A-NEXT: {{  $}}
+  ; GFx90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFx90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GFx90A-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFx90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFx90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFx90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFx90A-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFx90A-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFx90A-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFx90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFx90A-NEXT:   [[COPY12:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY12]], %subreg.sub0, killed [[COPY11]], %subreg.sub1, killed [[COPY10]], %subreg.sub2, killed [[COPY9]], %subreg.sub3
+  ; GFx90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
+  ; GFx90A-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFx90A-NEXT:   [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY13]], killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFx90A-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0
+  ; GFx90A-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
+  ; GFx90A-NEXT:   [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub1
+  ; GFx90A-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY15]], implicit $exec
+  ; GFx90A-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFx90A-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+  ; GFx90A-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
   %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
   ret double %ret
 }
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-no-rtn.ll
index 790cd8ef9eccf..81e8230e79259 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-no-rtn.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX942 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX942 %s
 
 define amdgpu_ps void @buffer_atomic_fadd_v2f16_offset_no_rtn(<2 x half> %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) {
   ; GFX908-LABEL: name: buffer_atomic_fadd_v2f16_offset_no_rtn
@@ -165,25 +165,41 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_v2f16_offset_no_rtn(<2 x half> %va
   ; GFX908-NEXT:   BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
   ; GFX908-NEXT:   S_ENDPGM 0
   ;
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offset_no_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offset_no_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3
+  ; GFX90A-NEXT:   BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offset_no_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   S_ENDPGM 0
   %ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 4095, i32 %soffset, i32 0)
   ret void
 }
@@ -210,26 +226,43 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_v2f16_offen_no_rtn(<2 x half> %val
   ; GFX908-NEXT:   BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
   ; GFX908-NEXT:   S_ENDPGM 0
   ;
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offen_no_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offen_no_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
+  ; GFX90A-NEXT:   BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offen_no_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   S_ENDPGM 0
   %ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret void
 }
@@ -256,26 +289,43 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_v2f16_idxen_no_rtn(<2 x half> %val
   ; GFX908-NEXT:   BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
   ; GFX908-NEXT:   S_ENDPGM 0
   ;
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_idxen_no_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_v2f16_idxen_no_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
+  ; GFX90A-NEXT:   BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_idxen_no_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   S_ENDPGM 0
   %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
   ret void
 }
@@ -304,28 +354,47 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_v2f16_bothen_no_rtn(<2 x half> %va
   ; GFX908-NEXT:   BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
   ; GFX908-NEXT:   S_ENDPGM 0
   ;
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_bothen_no_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_v2f16_bothen_no_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+  ; GFX90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX90A-NEXT:   BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_bothen_no_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX942-NEXT:   BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   S_ENDPGM 0
   %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-rtn.ll b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-rtn.ll
index 89e1a4be4e16c..803c2fec3d4d5 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-rtn.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX942 %s
 
 define amdgpu_ps <2 x half> @buffer_atomic_fadd_v2f16_offset_rtn(<2 x half> %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) {
   ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_v2f16_offset_rtn
@@ -86,106 +86,179 @@ define amdgpu_ps <2 x half> @buffer_atomic_fadd_v2f16_bothen_rtn(<2 x half> %val
 }
 
 define amdgpu_ps <2 x half> @buffer_ptr_atomic_fadd_v2f16_offset_rtn(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) {
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offset_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN]]
-  ; GFX90A_GFX942-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offset_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3
+  ; GFX90A-NEXT:   [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN]]
+  ; GFX90A-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offset_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN]]
+  ; GFX942-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
   %ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0)
   ret <2 x half> %ret
 }
 
 define amdgpu_ps <2 x half> @buffer_ptr_atomic_fadd_v2f16_offen_rtn(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offen_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN]]
-  ; GFX90A_GFX942-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offen_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
+  ; GFX90A-NEXT:   [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN]]
+  ; GFX90A-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offen_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN]]
+  ; GFX942-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
   %ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret <2 x half> %ret
 }
 
 define amdgpu_ps <2 x half> @buffer_ptr_atomic_fadd_v2f16_idxen_rtn(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_idxen_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN]]
-  ; GFX90A_GFX942-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_v2f16_idxen_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
+  ; GFX90A-NEXT:   [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN]]
+  ; GFX90A-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_idxen_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN]]
+  ; GFX942-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
   %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
   ret <2 x half> %ret
 }
 
 define amdgpu_ps <2 x half> @buffer_ptr_atomic_fadd_v2f16_bothen_rtn(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_bothen_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 3, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN]]
-  ; GFX90A_GFX942-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_v2f16_bothen_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+  ; GFX90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX90A-NEXT:   [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 3, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN]]
+  ; GFX90A-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_bothen_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX942-NEXT:   [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 3, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN]]
+  ; GFX942-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
   %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
   ret <2 x half> %ret
 }
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index b5e579b78a59c..2ae48f441fded 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -110,10 +110,8 @@ define amdgpu_kernel void @zero_init_kernel() {
 ;
 ; GFX942-LABEL: zero_init_kernel:
 ; GFX942:       ; %bb.0:
-; GFX942-NEXT:    s_mov_b32 s0, 0
-; GFX942-NEXT:    s_mov_b32 s1, s0
-; GFX942-NEXT:    s_mov_b32 s2, s0
-; GFX942-NEXT:    s_mov_b32 s3, s0
+; GFX942-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
@@ -304,10 +302,8 @@ define void @zero_init_foo() {
 ; GFX942-LABEL: zero_init_foo:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    s_mov_b32 s0, 0
-; GFX942-NEXT:    s_mov_b32 s1, s0
-; GFX942-NEXT:    s_mov_b32 s2, s0
-; GFX942-NEXT:    s_mov_b32 s3, s0
+; GFX942-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
@@ -1180,10 +1176,8 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    scratch_load_dword v0, off, off sc0 sc1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    s_mov_b32 s0, 0
-; GFX942-NEXT:    s_mov_b32 s1, s0
-; GFX942-NEXT:    s_mov_b32 s2, s0
-; GFX942-NEXT:    s_mov_b32 s3, s0
+; GFX942-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:256
@@ -1397,10 +1391,8 @@ define void @zero_init_small_offset_foo() {
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 sc0 sc1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    s_mov_b32 s0, 0
-; GFX942-NEXT:    s_mov_b32 s1, s0
-; GFX942-NEXT:    s_mov_b32 s2, s0
-; GFX942-NEXT:    s_mov_b32 s3, s0
+; GFX942-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
@@ -2420,10 +2412,8 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    scratch_load_dword v0, off, off offset:4 sc0 sc1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    s_mov_b32 s0, 0
-; GFX942-NEXT:    s_mov_b32 s1, s0
-; GFX942-NEXT:    s_mov_b32 s2, s0
-; GFX942-NEXT:    s_mov_b32 s3, s0
+; GFX942-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-NEXT:    s_movk_i32 s0, 0x4004
@@ -2656,10 +2646,8 @@ define void @zero_init_large_offset_foo() {
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 offset:4 sc0 sc1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    s_mov_b32 s0, 0
-; GFX942-NEXT:    s_mov_b32 s1, s0
-; GFX942-NEXT:    s_mov_b32 s2, s0
-; GFX942-NEXT:    s_mov_b32 s3, s0
+; GFX942-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-NEXT:    s_add_i32 s0, s32, 0x4004
@@ -4139,8 +4127,7 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) {
 ; GFX942-LABEL: store_load_i64_aligned:
 ; GFX942:       ; %bb.0: ; %bb
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v2, 15
-; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], 15
 ; GFX942-NEXT:    scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
@@ -4250,8 +4237,7 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX942-LABEL: store_load_i64_unaligned:
 ; GFX942:       ; %bb.0: ; %bb
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v2, 15
-; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], 15
 ; GFX942-NEXT:    scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
@@ -5010,10 +4996,8 @@ define amdgpu_ps void @large_offset() {
 ;
 ; GFX942-LABEL: large_offset:
 ; GFX942:       ; %bb.0: ; %bb
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:3024 sc0 sc1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:3024 sc0 sc1
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
index 069a47ec97bfe..20e7a15c829a9 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
@@ -4010,8 +4010,8 @@ define <2 x double> @v_no_fmaximum3_f64__multi_use(double %a, double %b, double
 ; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], v[4:5]
 ; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
 ; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call double @llvm.maximum.f64(double %a, double %b)
   %max1 = call double @llvm.maximum.f64(double %max0, double %c)
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
index d8746b58b16b7..8aebd6f719040 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
@@ -4010,8 +4010,8 @@ define <2 x double> @v_no_fminimum3_f64__multi_use(double %a, double %b, double
 ; GFX9-NEXT:    v_min_f64 v[2:3], v[0:1], v[4:5]
 ; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
 ; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call double @llvm.minimum.f64(double %a, double %b)
   %max1 = call double @llvm.minimum.f64(double %max0, double %c)
diff --git a/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll b/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll
index 0dfeb3454dad5..cfbad1fa6742e 100644
--- a/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll
@@ -726,8 +726,8 @@ define amdgpu_ps <3 x i32> @s_uitofp_v3i1_to_v3bf16(<3 x i1> inreg %num) {
 ; GFX9-LABEL: s_uitofp_v3i1_to_v3bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_and_b32 s4, 1, s2
-; GFX9-NEXT:    s_and_b32 s2, 1, s1
-; GFX9-NEXT:    s_bitcmp1_b32 s0, 0
+; GFX9-NEXT:    s_and_b32 s2, 1, s0
+; GFX9-NEXT:    s_bitcmp1_b32 s1, 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s2, 1
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], -1, 0
@@ -760,8 +760,8 @@ define amdgpu_ps <3 x i32> @s_uitofp_v3i1_to_v3bf16(<3 x i1> inreg %num) {
 ; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_uitofp_v3i1_to_v3bf16:
@@ -1178,8 +1178,8 @@ define amdgpu_ps <4 x i32> @s_uitofp_v4i1_to_v4bf16(<4 x i1> inreg %num) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_and_b32 s6, 1, s3
 ; GFX9-NEXT:    s_and_b32 s4, 1, s2
-; GFX9-NEXT:    s_and_b32 s2, 1, s1
-; GFX9-NEXT:    s_bitcmp1_b32 s0, 0
+; GFX9-NEXT:    s_and_b32 s2, 1, s0
+; GFX9-NEXT:    s_bitcmp1_b32 s1, 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s2, 1
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], -1, 0
@@ -1223,8 +1223,8 @@ define amdgpu_ps <4 x i32> @s_uitofp_v4i1_to_v4bf16(<4 x i1> inreg %num) {
 ; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v3
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX9-NEXT:    v_readfirstlane_b32 s1, v3
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_uitofp_v4i1_to_v4bf16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
index f971080e02c5b..090707eda3ca5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
@@ -561,17 +561,17 @@ define <2 x double> @v_maximum_v2f64(<2 x double> %src0, <2 x double> %src1) {
 ; GFX950-LABEL: v_maximum_v2f64:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_max_f64 v[8:9], v[0:1], v[4:5]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
-; GFX950-NEXT:    v_max_f64 v[4:5], v[2:3], v[6:7]
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e64 v0, v8, 0, vcc
-; GFX950-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc
+; GFX950-NEXT:    v_max_f64 v[8:9], v[2:3], v[6:7]
+; GFX950-NEXT:    v_mov_b32_e32 v10, 0x7ff80000
 ; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[6:7]
+; GFX950-NEXT:    v_max_f64 v[6:7], v[0:1], v[4:5]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v9, v10, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v8, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v5, v8, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v7, v10, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v2f64:
@@ -630,12 +630,19 @@ define <2 x double> @v_maximum_v2f64__nnan(<2 x double> %src0, <2 x double> %src
 ; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v2f64__nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v2f64__nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX900-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v2f64__nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX950-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v2f64__nnan:
 ; GFX10:       ; %bb.0:
@@ -711,17 +718,17 @@ define <2 x double> @v_maximum_v2f64__nsz(<2 x double> %src0, <2 x double> %src1
 ; GFX950-LABEL: v_maximum_v2f64__nsz:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_max_f64 v[8:9], v[0:1], v[4:5]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
-; GFX950-NEXT:    v_max_f64 v[4:5], v[2:3], v[6:7]
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e64 v0, v8, 0, vcc
-; GFX950-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc
+; GFX950-NEXT:    v_max_f64 v[8:9], v[2:3], v[6:7]
+; GFX950-NEXT:    v_mov_b32_e32 v10, 0x7ff80000
 ; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[6:7]
+; GFX950-NEXT:    v_max_f64 v[6:7], v[0:1], v[4:5]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v9, v10, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v8, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v5, v8, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v7, v10, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v2f64__nsz:
@@ -780,12 +787,19 @@ define <2 x double> @v_maximum_v2f64__nnan_nsz(<2 x double> %src0, <2 x double>
 ; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v2f64__nnan_nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v2f64__nnan_nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX900-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v2f64__nnan_nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX950-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v2f64__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -1008,22 +1022,22 @@ define <3 x double> @v_maximum_v3f64(<3 x double> %src0, <3 x double> %src1) {
 ; GFX950-LABEL: v_maximum_v3f64:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_max_f64 v[12:13], v[0:1], v[6:7]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[6:7]
-; GFX950-NEXT:    v_max_f64 v[6:7], v[2:3], v[8:9]
+; GFX950-NEXT:    v_max_f64 v[12:13], v[4:5], v[10:11]
+; GFX950-NEXT:    v_mov_b32_e32 v14, 0x7ff80000
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[10:11]
+; GFX950-NEXT:    v_max_f64 v[10:11], v[2:3], v[8:9]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e64 v0, v12, 0, vcc
-; GFX950-NEXT:    v_mov_b32_e32 v12, 0x7ff80000
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v13, v12, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v13, v14, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v4, v12, 0, vcc
 ; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[8:9]
+; GFX950-NEXT:    v_max_f64 v[8:9], v[0:1], v[6:7]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v11, v14, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v10, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[6:7]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v2, v6, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v7, v12, vcc
-; GFX950-NEXT:    v_max_f64 v[6:7], v[4:5], v[10:11]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[10:11]
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v4, v6, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v7, v12, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v9, v14, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v8, 0, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v3f64:
@@ -1092,13 +1106,21 @@ define <3 x double> @v_maximum_v3f64__nnan(<3 x double> %src0, <3 x double> %src
 ; GFX8-NEXT:    v_max_f64 v[4:5], v[4:5], v[10:11]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v3f64__nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[6:7]
-; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], v[8:9]
-; GFX9-NEXT:    v_max_f64 v[4:5], v[4:5], v[10:11]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v3f64__nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f64 v[0:1], v[0:1], v[6:7]
+; GFX900-NEXT:    v_max_f64 v[2:3], v[2:3], v[8:9]
+; GFX900-NEXT:    v_max_f64 v[4:5], v[4:5], v[10:11]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v3f64__nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f64 v[4:5], v[4:5], v[10:11]
+; GFX950-NEXT:    v_max_f64 v[2:3], v[2:3], v[8:9]
+; GFX950-NEXT:    v_max_f64 v[0:1], v[0:1], v[6:7]
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v3f64__nnan:
 ; GFX10:       ; %bb.0:
@@ -1189,22 +1211,22 @@ define <3 x double> @v_maximum_v3f64__nsz(<3 x double> %src0, <3 x double> %src1
 ; GFX950-LABEL: v_maximum_v3f64__nsz:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_max_f64 v[12:13], v[0:1], v[6:7]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[6:7]
-; GFX950-NEXT:    v_max_f64 v[6:7], v[2:3], v[8:9]
+; GFX950-NEXT:    v_max_f64 v[12:13], v[4:5], v[10:11]
+; GFX950-NEXT:    v_mov_b32_e32 v14, 0x7ff80000
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[10:11]
+; GFX950-NEXT:    v_max_f64 v[10:11], v[2:3], v[8:9]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e64 v0, v12, 0, vcc
-; GFX950-NEXT:    v_mov_b32_e32 v12, 0x7ff80000
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v13, v12, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v13, v14, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v4, v12, 0, vcc
 ; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[8:9]
+; GFX950-NEXT:    v_max_f64 v[8:9], v[0:1], v[6:7]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v11, v14, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v10, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[6:7]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v2, v6, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v7, v12, vcc
-; GFX950-NEXT:    v_max_f64 v[6:7], v[4:5], v[10:11]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[10:11]
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v4, v6, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v7, v12, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v9, v14, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v8, 0, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v3f64__nsz:
@@ -1273,13 +1295,21 @@ define <3 x double> @v_maximum_v3f64__nnan_nsz(<3 x double> %src0, <3 x double>
 ; GFX8-NEXT:    v_max_f64 v[4:5], v[4:5], v[10:11]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v3f64__nnan_nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[6:7]
-; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], v[8:9]
-; GFX9-NEXT:    v_max_f64 v[4:5], v[4:5], v[10:11]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v3f64__nnan_nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f64 v[0:1], v[0:1], v[6:7]
+; GFX900-NEXT:    v_max_f64 v[2:3], v[2:3], v[8:9]
+; GFX900-NEXT:    v_max_f64 v[4:5], v[4:5], v[10:11]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v3f64__nnan_nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f64 v[4:5], v[4:5], v[10:11]
+; GFX950-NEXT:    v_max_f64 v[2:3], v[2:3], v[8:9]
+; GFX950-NEXT:    v_max_f64 v[0:1], v[0:1], v[6:7]
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v3f64__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -1382,27 +1412,27 @@ define <4 x double> @v_maximum_v4f64(<4 x double> %src0, <4 x double> %src1) {
 ; GFX950-LABEL: v_maximum_v4f64:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_max_f64 v[16:17], v[0:1], v[8:9]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
-; GFX950-NEXT:    v_max_f64 v[8:9], v[2:3], v[10:11]
+; GFX950-NEXT:    v_max_f64 v[16:17], v[6:7], v[14:15]
+; GFX950-NEXT:    v_mov_b32_e32 v18, 0x7ff80000
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[6:7], v[14:15]
+; GFX950-NEXT:    v_max_f64 v[14:15], v[4:5], v[12:13]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e64 v0, v16, 0, vcc
-; GFX950-NEXT:    v_mov_b32_e32 v16, 0x7ff80000
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v17, v16, vcc
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[10:11]
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v2, v8, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v9, v16, vcc
-; GFX950-NEXT:    v_max_f64 v[8:9], v[4:5], v[12:13]
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v17, v18, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v6, v16, 0, vcc
 ; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[12:13]
+; GFX950-NEXT:    v_max_f64 v[12:13], v[2:3], v[10:11]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v15, v18, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v4, v14, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[10:11]
+; GFX950-NEXT:    v_max_f64 v[10:11], v[0:1], v[8:9]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v13, v18, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v12, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v4, v8, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v9, v16, vcc
-; GFX950-NEXT:    v_max_f64 v[8:9], v[6:7], v[14:15]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[6:7], v[14:15]
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v6, v8, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v9, v16, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v11, v18, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v10, 0, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v4f64:
@@ -1482,14 +1512,23 @@ define <4 x double> @v_maximum_v4f64__nnan(<4 x double> %src0, <4 x double> %src
 ; GFX8-NEXT:    v_max_f64 v[6:7], v[6:7], v[14:15]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v4f64__nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[8:9]
-; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], v[10:11]
-; GFX9-NEXT:    v_max_f64 v[4:5], v[4:5], v[12:13]
-; GFX9-NEXT:    v_max_f64 v[6:7], v[6:7], v[14:15]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v4f64__nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f64 v[0:1], v[0:1], v[8:9]
+; GFX900-NEXT:    v_max_f64 v[2:3], v[2:3], v[10:11]
+; GFX900-NEXT:    v_max_f64 v[4:5], v[4:5], v[12:13]
+; GFX900-NEXT:    v_max_f64 v[6:7], v[6:7], v[14:15]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v4f64__nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f64 v[6:7], v[6:7], v[14:15]
+; GFX950-NEXT:    v_max_f64 v[4:5], v[4:5], v[12:13]
+; GFX950-NEXT:    v_max_f64 v[2:3], v[2:3], v[10:11]
+; GFX950-NEXT:    v_max_f64 v[0:1], v[0:1], v[8:9]
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v4f64__nnan:
 ; GFX10:       ; %bb.0:
@@ -1595,27 +1634,27 @@ define <4 x double> @v_maximum_v4f64__nsz(<4 x double> %src0, <4 x double> %src1
 ; GFX950-LABEL: v_maximum_v4f64__nsz:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_max_f64 v[16:17], v[0:1], v[8:9]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
-; GFX950-NEXT:    v_max_f64 v[8:9], v[2:3], v[10:11]
+; GFX950-NEXT:    v_max_f64 v[16:17], v[6:7], v[14:15]
+; GFX950-NEXT:    v_mov_b32_e32 v18, 0x7ff80000
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[6:7], v[14:15]
+; GFX950-NEXT:    v_max_f64 v[14:15], v[4:5], v[12:13]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e64 v0, v16, 0, vcc
-; GFX950-NEXT:    v_mov_b32_e32 v16, 0x7ff80000
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v17, v16, vcc
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[10:11]
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v2, v8, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v9, v16, vcc
-; GFX950-NEXT:    v_max_f64 v[8:9], v[4:5], v[12:13]
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v17, v18, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v6, v16, 0, vcc
 ; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[12:13]
+; GFX950-NEXT:    v_max_f64 v[12:13], v[2:3], v[10:11]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v15, v18, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v4, v14, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[10:11]
+; GFX950-NEXT:    v_max_f64 v[10:11], v[0:1], v[8:9]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v13, v18, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v12, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v4, v8, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v9, v16, vcc
-; GFX950-NEXT:    v_max_f64 v[8:9], v[6:7], v[14:15]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[6:7], v[14:15]
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v6, v8, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v9, v16, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v11, v18, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v10, 0, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v4f64__nsz:
@@ -1695,14 +1734,23 @@ define <4 x double> @v_maximum_v4f64__nnan_nsz(<4 x double> %src0, <4 x double>
 ; GFX8-NEXT:    v_max_f64 v[6:7], v[6:7], v[14:15]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v4f64__nnan_nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[8:9]
-; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], v[10:11]
-; GFX9-NEXT:    v_max_f64 v[4:5], v[4:5], v[12:13]
-; GFX9-NEXT:    v_max_f64 v[6:7], v[6:7], v[14:15]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v4f64__nnan_nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f64 v[0:1], v[0:1], v[8:9]
+; GFX900-NEXT:    v_max_f64 v[2:3], v[2:3], v[10:11]
+; GFX900-NEXT:    v_max_f64 v[4:5], v[4:5], v[12:13]
+; GFX900-NEXT:    v_max_f64 v[6:7], v[6:7], v[14:15]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v4f64__nnan_nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f64 v[6:7], v[6:7], v[14:15]
+; GFX950-NEXT:    v_max_f64 v[4:5], v[4:5], v[12:13]
+; GFX950-NEXT:    v_max_f64 v[2:3], v[2:3], v[10:11]
+; GFX950-NEXT:    v_max_f64 v[0:1], v[0:1], v[8:9]
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v4f64__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -1864,43 +1912,43 @@ define <8 x double> @v_maximum_v8f64(<8 x double> %src0, <8 x double> %src1) {
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    scratch_load_dword v31, off, s32
 ; GFX950-NEXT:    v_mov_b32_e32 v54, 0x7ff80000
-; GFX950-NEXT:    v_max_f64 v[32:33], v[0:1], v[16:17]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[16:17]
-; GFX950-NEXT:    v_max_f64 v[34:35], v[2:3], v[18:19]
-; GFX950-NEXT:    v_max_f64 v[36:37], v[4:5], v[20:21]
-; GFX950-NEXT:    v_cndmask_b32_e64 v0, v32, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v33, v54, vcc
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[18:19]
+; GFX950-NEXT:    v_max_f64 v[32:33], v[12:13], v[28:29]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[12:13], v[28:29]
+; GFX950-NEXT:    v_max_f64 v[34:35], v[10:11], v[26:27]
+; GFX950-NEXT:    v_max_f64 v[36:37], v[8:9], v[24:25]
+; GFX950-NEXT:    v_cndmask_b32_e32 v13, v33, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v12, v32, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[10:11], v[26:27]
 ; GFX950-NEXT:    v_max_f64 v[38:39], v[6:7], v[22:23]
-; GFX950-NEXT:    v_max_f64 v[48:49], v[8:9], v[24:25]
-; GFX950-NEXT:    v_cndmask_b32_e64 v2, v34, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v35, v54, vcc
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[20:21]
-; GFX950-NEXT:    v_max_f64 v[50:51], v[10:11], v[26:27]
-; GFX950-NEXT:    v_max_f64 v[52:53], v[12:13], v[28:29]
-; GFX950-NEXT:    v_cndmask_b32_e64 v4, v36, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v37, v54, vcc
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[6:7], v[22:23]
-; GFX950-NEXT:    s_waitcnt vmcnt(0)
-; GFX950-NEXT:    v_max_f64 v[16:17], v[14:15], v[30:31]
-; GFX950-NEXT:    v_cndmask_b32_e64 v6, v38, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v39, v54, vcc
+; GFX950-NEXT:    v_max_f64 v[48:49], v[4:5], v[20:21]
+; GFX950-NEXT:    v_cndmask_b32_e32 v11, v35, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v10, v34, 0, vcc
 ; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[8:9], v[24:25]
+; GFX950-NEXT:    v_max_f64 v[50:51], v[2:3], v[18:19]
+; GFX950-NEXT:    v_max_f64 v[52:53], v[0:1], v[16:17]
+; GFX950-NEXT:    v_cndmask_b32_e32 v9, v37, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v8, v36, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[6:7], v[22:23]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v8, v48, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v9, v49, v54, vcc
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[10:11], v[26:27]
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v39, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v6, v38, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[20:21]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v10, v50, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v11, v51, v54, vcc
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[12:13], v[28:29]
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v49, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v4, v48, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[18:19]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v12, v52, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v13, v53, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v51, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v50, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[16:17]
+; GFX950-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-NEXT:    v_max_f64 v[16:17], v[14:15], v[30:31]
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v53, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v52, 0, vcc
 ; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[14:15], v[30:31]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v14, v16, 0, vcc
 ; GFX950-NEXT:    v_cndmask_b32_e32 v15, v17, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v14, v16, 0, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v8f64:
@@ -2371,152 +2419,144 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX950-NEXT:    v_accvgpr_write_b32 a3, v43 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a4, v44 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a5, v45 ; Reload Reuse
-; GFX950-NEXT:    v_accvgpr_write_b32 a6, v46 ; Reload Reuse
-; GFX950-NEXT:    v_accvgpr_write_b32 a7, v47 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a8, v56 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a9, v57 ; Reload Reuse
-; GFX950-NEXT:    scratch_load_dword v33, off, s32 offset:8
-; GFX950-NEXT:    scratch_load_dword v32, off, s32 offset:4
-; GFX950-NEXT:    scratch_load_dword v37, off, s32 offset:16
-; GFX950-NEXT:    scratch_load_dword v36, off, s32 offset:12
-; GFX950-NEXT:    scratch_load_dword v39, off, s32 offset:24
-; GFX950-NEXT:    scratch_load_dword v38, off, s32 offset:20
+; GFX950-NEXT:    scratch_load_dword v41, off, s32 offset:120
+; GFX950-NEXT:    scratch_load_dword v40, off, s32 offset:116
 ; GFX950-NEXT:    scratch_load_dword v57, off, s32 offset:32
 ; GFX950-NEXT:    scratch_load_dword v56, off, s32 offset:28
-; GFX950-NEXT:    scratch_load_dword v47, off, s32 offset:40
-; GFX950-NEXT:    scratch_load_dword v46, off, s32 offset:36
-; GFX950-NEXT:    scratch_load_dword v45, off, s32 offset:48
-; GFX950-NEXT:    scratch_load_dword v44, off, s32 offset:44
-; GFX950-NEXT:    scratch_load_dword v43, off, s32 offset:56
-; GFX950-NEXT:    scratch_load_dword v42, off, s32 offset:52
-; GFX950-NEXT:    scratch_load_dword v41, off, s32 offset:64
-; GFX950-NEXT:    scratch_load_dword v40, off, s32 offset:60
-; GFX950-NEXT:    scratch_load_dword v55, off, s32 offset:72
-; GFX950-NEXT:    scratch_load_dword v54, off, s32 offset:68
-; GFX950-NEXT:    scratch_load_dword v53, off, s32 offset:80
-; GFX950-NEXT:    scratch_load_dword v52, off, s32 offset:76
-; GFX950-NEXT:    scratch_load_dword v51, off, s32 offset:88
-; GFX950-NEXT:    scratch_load_dword v50, off, s32 offset:84
-; GFX950-NEXT:    scratch_load_dword v49, off, s32 offset:96
-; GFX950-NEXT:    scratch_load_dword v48, off, s32 offset:92
+; GFX950-NEXT:    scratch_load_dword v45, off, s32 offset:24
+; GFX950-NEXT:    scratch_load_dword v44, off, s32 offset:20
+; GFX950-NEXT:    scratch_load_dword v43, off, s32 offset:16
+; GFX950-NEXT:    scratch_load_dword v42, off, s32 offset:12
+; GFX950-NEXT:    scratch_load_dword v53, off, s32 offset:8
+; GFX950-NEXT:    scratch_load_dword v52, off, s32 offset:4
+; GFX950-NEXT:    scratch_load_dword v49, off, s32 offset:112
 ; GFX950-NEXT:    scratch_load_dword v31, off, s32
-; GFX950-NEXT:    scratch_load_dword v35, off, s32 offset:104
-; GFX950-NEXT:    scratch_load_dword v34, off, s32 offset:100
+; GFX950-NEXT:    scratch_load_dword v55, off, s32 offset:128
+; GFX950-NEXT:    scratch_load_dword v54, off, s32 offset:124
+; GFX950-NEXT:    scratch_load_dword v48, off, s32 offset:108
+; GFX950-NEXT:    scratch_load_dword v51, off, s32 offset:104
+; GFX950-NEXT:    scratch_load_dword v50, off, s32 offset:100
+; GFX950-NEXT:    scratch_load_dword v39, off, s32 offset:96
+; GFX950-NEXT:    scratch_load_dword v38, off, s32 offset:92
+; GFX950-NEXT:    scratch_load_dword v37, off, s32 offset:88
+; GFX950-NEXT:    scratch_load_dword v36, off, s32 offset:84
+; GFX950-NEXT:    scratch_load_dword v35, off, s32 offset:80
+; GFX950-NEXT:    scratch_load_dword v34, off, s32 offset:76
+; GFX950-NEXT:    scratch_load_dword v33, off, s32 offset:72
+; GFX950-NEXT:    scratch_load_dword v32, off, s32 offset:68
+; GFX950-NEXT:    v_accvgpr_write_b32 a6, v46 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_write_b32 a7, v47 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a10, v58 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a11, v59 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a12, v60 ; Reload Reuse
-; GFX950-NEXT:    v_accvgpr_write_b32 a13, v61 ; Reload Reuse
-; GFX950-NEXT:    v_accvgpr_write_b32 a14, v62 ; Reload Reuse
-; GFX950-NEXT:    v_accvgpr_write_b32 a15, v63 ; Reload Reuse
-; GFX950-NEXT:    s_waitcnt vmcnt(25)
-; GFX950-NEXT:    v_max_f64 v[58:59], v[0:1], v[32:33]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[32:33]
-; GFX950-NEXT:    scratch_load_dword v33, off, s32 offset:112
-; GFX950-NEXT:    scratch_load_dword v32, off, s32 offset:108
-; GFX950-NEXT:    s_waitcnt vmcnt(25)
-; GFX950-NEXT:    v_max_f64 v[60:61], v[2:3], v[36:37]
-; GFX950-NEXT:    v_cmp_u_f64_e64 s[0:1], v[2:3], v[36:37]
-; GFX950-NEXT:    scratch_load_dword v37, off, s32 offset:120
-; GFX950-NEXT:    scratch_load_dword v36, off, s32 offset:116
-; GFX950-NEXT:    s_waitcnt vmcnt(25)
-; GFX950-NEXT:    v_max_f64 v[62:63], v[4:5], v[38:39]
-; GFX950-NEXT:    v_cmp_u_f64_e64 s[2:3], v[4:5], v[38:39]
-; GFX950-NEXT:    scratch_load_dword v39, off, s32 offset:128
-; GFX950-NEXT:    scratch_load_dword v38, off, s32 offset:124
-; GFX950-NEXT:    v_mov_b32_e32 v2, 0x7ff80000
-; GFX950-NEXT:    s_waitcnt vmcnt(25)
-; GFX950-NEXT:    v_max_f64 v[0:1], v[6:7], v[56:57]
-; GFX950-NEXT:    v_cmp_u_f64_e64 s[4:5], v[6:7], v[56:57]
+; GFX950-NEXT:    v_mov_b32_e32 v60, 0x7ff80000
 ; GFX950-NEXT:    s_waitcnt vmcnt(23)
-; GFX950-NEXT:    v_max_f64 v[56:57], v[8:9], v[46:47]
-; GFX950-NEXT:    v_cndmask_b32_e64 v58, v58, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v59, v59, v2, vcc
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[8:9], v[46:47]
-; GFX950-NEXT:    v_cndmask_b32_e64 v6, v0, 0, s[4:5]
-; GFX950-NEXT:    v_cndmask_b32_e64 v7, v1, v2, s[4:5]
-; GFX950-NEXT:    v_cndmask_b32_e64 v8, v56, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v9, v57, v2, vcc
-; GFX950-NEXT:    s_waitcnt vmcnt(21)
-; GFX950-NEXT:    v_max_f64 v[0:1], v[10:11], v[44:45]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[10:11], v[44:45]
-; GFX950-NEXT:    v_cndmask_b32_e64 v60, v60, 0, s[0:1]
-; GFX950-NEXT:    v_cndmask_b32_e64 v3, v61, v2, s[0:1]
-; GFX950-NEXT:    v_cndmask_b32_e64 v10, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v11, v1, v2, vcc
+; GFX950-NEXT:    v_max_f64 v[46:47], v[28:29], v[40:41]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[28:29], v[40:41]
+; GFX950-NEXT:    scratch_load_dword v41, off, s32 offset:64
+; GFX950-NEXT:    scratch_load_dword v40, off, s32 offset:60
+; GFX950-NEXT:    s_waitcnt vmcnt(23)
+; GFX950-NEXT:    v_max_f64 v[58:59], v[6:7], v[56:57]
+; GFX950-NEXT:    v_cmp_u_f64_e64 s[0:1], v[6:7], v[56:57]
+; GFX950-NEXT:    scratch_load_dword v7, off, s32 offset:56
+; GFX950-NEXT:    scratch_load_dword v6, off, s32 offset:52
+; GFX950-NEXT:    s_waitcnt vmcnt(23)
+; GFX950-NEXT:    v_max_f64 v[56:57], v[4:5], v[44:45]
+; GFX950-NEXT:    v_cmp_u_f64_e64 s[2:3], v[4:5], v[44:45]
+; GFX950-NEXT:    scratch_load_dword v5, off, s32 offset:48
+; GFX950-NEXT:    scratch_load_dword v4, off, s32 offset:44
+; GFX950-NEXT:    s_waitcnt vmcnt(23)
+; GFX950-NEXT:    v_max_f64 v[44:45], v[2:3], v[42:43]
+; GFX950-NEXT:    v_cmp_u_f64_e64 s[4:5], v[2:3], v[42:43]
+; GFX950-NEXT:    scratch_load_dword v3, off, s32 offset:40
+; GFX950-NEXT:    scratch_load_dword v2, off, s32 offset:36
+; GFX950-NEXT:    s_waitcnt vmcnt(23)
+; GFX950-NEXT:    v_max_f64 v[42:43], v[0:1], v[52:53]
+; GFX950-NEXT:    v_cmp_u_f64_e64 s[6:7], v[0:1], v[52:53]
 ; GFX950-NEXT:    s_waitcnt vmcnt(19)
-; GFX950-NEXT:    v_max_f64 v[0:1], v[12:13], v[42:43]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[12:13], v[42:43]
-; GFX950-NEXT:    v_cndmask_b32_e64 v4, v62, 0, s[2:3]
-; GFX950-NEXT:    v_cndmask_b32_e64 v5, v63, v2, s[2:3]
-; GFX950-NEXT:    v_cndmask_b32_e64 v12, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v13, v1, v2, vcc
-; GFX950-NEXT:    s_waitcnt vmcnt(17)
-; GFX950-NEXT:    v_max_f64 v[0:1], v[14:15], v[40:41]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[14:15], v[40:41]
-; GFX950-NEXT:    v_accvgpr_read_b32 v63, a15 ; Reload Reuse
-; GFX950-NEXT:    v_accvgpr_read_b32 v62, a14 ; Reload Reuse
-; GFX950-NEXT:    v_cndmask_b32_e64 v14, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v15, v1, v2, vcc
-; GFX950-NEXT:    s_waitcnt vmcnt(15)
-; GFX950-NEXT:    v_max_f64 v[0:1], v[16:17], v[54:55]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[16:17], v[54:55]
-; GFX950-NEXT:    v_accvgpr_read_b32 v61, a13 ; Reload Reuse
-; GFX950-NEXT:    v_accvgpr_read_b32 v57, a9 ; Reload Reuse
-; GFX950-NEXT:    v_cndmask_b32_e64 v16, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v17, v1, v2, vcc
-; GFX950-NEXT:    s_waitcnt vmcnt(13)
-; GFX950-NEXT:    v_max_f64 v[0:1], v[18:19], v[52:53]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[18:19], v[52:53]
-; GFX950-NEXT:    v_accvgpr_read_b32 v56, a8 ; Reload Reuse
+; GFX950-NEXT:    v_max_f64 v[0:1], v[30:31], v[54:55]
+; GFX950-NEXT:    s_waitcnt vmcnt(18)
+; GFX950-NEXT:    v_max_f64 v[52:53], v[26:27], v[48:49]
+; GFX950-NEXT:    v_cmp_u_f64_e64 s[8:9], v[30:31], v[54:55]
+; GFX950-NEXT:    v_cndmask_b32_e32 v29, v47, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v28, v46, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[26:27], v[48:49]
+; GFX950-NEXT:    v_cndmask_b32_e64 v31, v1, v60, s[8:9]
+; GFX950-NEXT:    v_cndmask_b32_e64 v30, v0, 0, s[8:9]
+; GFX950-NEXT:    v_cndmask_b32_e32 v27, v53, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v26, v52, 0, vcc
+; GFX950-NEXT:    s_waitcnt vmcnt(16)
+; GFX950-NEXT:    v_max_f64 v[0:1], v[24:25], v[50:51]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[24:25], v[50:51]
 ; GFX950-NEXT:    v_accvgpr_read_b32 v47, a7 ; Reload Reuse
-; GFX950-NEXT:    v_cndmask_b32_e64 v18, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v19, v1, v2, vcc
-; GFX950-NEXT:    s_waitcnt vmcnt(11)
-; GFX950-NEXT:    v_max_f64 v[0:1], v[20:21], v[50:51]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[20:21], v[50:51]
 ; GFX950-NEXT:    v_accvgpr_read_b32 v46, a6 ; Reload Reuse
-; GFX950-NEXT:    v_accvgpr_read_b32 v45, a5 ; Reload Reuse
-; GFX950-NEXT:    v_cndmask_b32_e64 v20, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v21, v1, v2, vcc
-; GFX950-NEXT:    s_waitcnt vmcnt(9)
-; GFX950-NEXT:    v_max_f64 v[0:1], v[22:23], v[48:49]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[22:23], v[48:49]
-; GFX950-NEXT:    v_accvgpr_read_b32 v44, a4 ; Reload Reuse
-; GFX950-NEXT:    v_accvgpr_read_b32 v43, a3 ; Reload Reuse
+; GFX950-NEXT:    v_cndmask_b32_e32 v25, v1, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v24, v0, 0, vcc
+; GFX950-NEXT:    s_waitcnt vmcnt(14)
+; GFX950-NEXT:    v_max_f64 v[0:1], v[22:23], v[38:39]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[22:23], v[38:39]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v23, v1, v60, vcc
 ; GFX950-NEXT:    v_cndmask_b32_e64 v22, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v23, v1, v2, vcc
+; GFX950-NEXT:    s_waitcnt vmcnt(12)
+; GFX950-NEXT:    v_max_f64 v[0:1], v[20:21], v[36:37]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[20:21], v[36:37]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v21, v1, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v20, v0, 0, vcc
+; GFX950-NEXT:    s_waitcnt vmcnt(10)
+; GFX950-NEXT:    v_max_f64 v[0:1], v[18:19], v[34:35]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[18:19], v[34:35]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v19, v1, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v18, v0, 0, vcc
+; GFX950-NEXT:    s_waitcnt vmcnt(8)
+; GFX950-NEXT:    v_max_f64 v[0:1], v[16:17], v[32:33]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[16:17], v[32:33]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v17, v1, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v16, v0, 0, vcc
 ; GFX950-NEXT:    s_waitcnt vmcnt(6)
-; GFX950-NEXT:    v_max_f64 v[0:1], v[24:25], v[34:35]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[24:25], v[34:35]
-; GFX950-NEXT:    v_accvgpr_read_b32 v42, a2 ; Reload Reuse
+; GFX950-NEXT:    v_max_f64 v[0:1], v[14:15], v[40:41]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[14:15], v[40:41]
 ; GFX950-NEXT:    v_accvgpr_read_b32 v41, a1 ; Reload Reuse
-; GFX950-NEXT:    v_cndmask_b32_e64 v24, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v25, v1, v2, vcc
 ; GFX950-NEXT:    v_accvgpr_read_b32 v40, a0 ; Reload Reuse
+; GFX950-NEXT:    v_cndmask_b32_e32 v15, v1, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v14, v0, 0, vcc
 ; GFX950-NEXT:    s_waitcnt vmcnt(4)
-; GFX950-NEXT:    v_max_f64 v[0:1], v[26:27], v[32:33]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[26:27], v[32:33]
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v26, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v27, v1, v2, vcc
+; GFX950-NEXT:    v_max_f64 v[0:1], v[12:13], v[6:7]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[12:13], v[6:7]
+; GFX950-NEXT:    v_cndmask_b32_e64 v7, v59, v60, s[0:1]
+; GFX950-NEXT:    v_cndmask_b32_e64 v6, v58, 0, s[0:1]
+; GFX950-NEXT:    v_cndmask_b32_e32 v13, v1, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v12, v0, 0, vcc
 ; GFX950-NEXT:    s_waitcnt vmcnt(2)
-; GFX950-NEXT:    v_max_f64 v[0:1], v[28:29], v[36:37]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[28:29], v[36:37]
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v28, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v29, v1, v2, vcc
+; GFX950-NEXT:    v_max_f64 v[0:1], v[10:11], v[4:5]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[10:11], v[4:5]
+; GFX950-NEXT:    v_cndmask_b32_e64 v5, v57, v60, s[2:3]
+; GFX950-NEXT:    v_cndmask_b32_e64 v4, v56, 0, s[2:3]
+; GFX950-NEXT:    v_cndmask_b32_e32 v11, v1, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v10, v0, 0, vcc
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
-; GFX950-NEXT:    v_max_f64 v[0:1], v[30:31], v[38:39]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[30:31], v[38:39]
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v30, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v31, v1, v2, vcc
-; GFX950-NEXT:    v_mov_b32_e32 v0, v58
-; GFX950-NEXT:    v_mov_b32_e32 v1, v59
-; GFX950-NEXT:    v_mov_b32_e32 v2, v60
+; GFX950-NEXT:    v_max_f64 v[0:1], v[8:9], v[2:3]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[8:9], v[2:3]
+; GFX950-NEXT:    v_cndmask_b32_e64 v3, v45, v60, s[4:5]
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v44, 0, s[4:5]
+; GFX950-NEXT:    v_cndmask_b32_e32 v9, v1, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v8, v0, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v1, v43, v60, s[6:7]
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v42, 0, s[6:7]
 ; GFX950-NEXT:    v_accvgpr_read_b32 v60, a12 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_read_b32 v59, a11 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_read_b32 v58, a10 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v57, a9 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v56, a8 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v45, a5 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v44, a4 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v43, a3 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v42, a2 ; Reload Reuse
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v16f64:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
index dfd67873c3b86..b119dd425463b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
@@ -561,17 +561,17 @@ define <2 x double> @v_minimum_v2f64(<2 x double> %src0, <2 x double> %src1) {
 ; GFX950-LABEL: v_minimum_v2f64:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_min_f64 v[8:9], v[0:1], v[4:5]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
-; GFX950-NEXT:    v_min_f64 v[4:5], v[2:3], v[6:7]
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e64 v0, v8, 0, vcc
-; GFX950-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc
+; GFX950-NEXT:    v_min_f64 v[8:9], v[2:3], v[6:7]
+; GFX950-NEXT:    v_mov_b32_e32 v10, 0x7ff80000
 ; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[6:7]
+; GFX950-NEXT:    v_min_f64 v[6:7], v[0:1], v[4:5]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v9, v10, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v8, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v5, v8, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v7, v10, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v2f64:
@@ -630,12 +630,19 @@ define <2 x double> @v_minimum_v2f64__nnan(<2 x double> %src0, <2 x double> %src
 ; GFX8-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v2f64__nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX9-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v2f64__nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX900-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v2f64__nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX950-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v2f64__nnan:
 ; GFX10:       ; %bb.0:
@@ -711,17 +718,17 @@ define <2 x double> @v_minimum_v2f64__nsz(<2 x double> %src0, <2 x double> %src1
 ; GFX950-LABEL: v_minimum_v2f64__nsz:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_min_f64 v[8:9], v[0:1], v[4:5]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
-; GFX950-NEXT:    v_min_f64 v[4:5], v[2:3], v[6:7]
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e64 v0, v8, 0, vcc
-; GFX950-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc
+; GFX950-NEXT:    v_min_f64 v[8:9], v[2:3], v[6:7]
+; GFX950-NEXT:    v_mov_b32_e32 v10, 0x7ff80000
 ; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[6:7]
+; GFX950-NEXT:    v_min_f64 v[6:7], v[0:1], v[4:5]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v9, v10, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v8, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v5, v8, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v7, v10, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v2f64__nsz:
@@ -780,12 +787,19 @@ define <2 x double> @v_minimum_v2f64__nnan_nsz(<2 x double> %src0, <2 x double>
 ; GFX8-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v2f64__nnan_nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX9-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v2f64__nnan_nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX900-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v2f64__nnan_nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX950-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v2f64__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -1008,22 +1022,22 @@ define <3 x double> @v_minimum_v3f64(<3 x double> %src0, <3 x double> %src1) {
 ; GFX950-LABEL: v_minimum_v3f64:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_min_f64 v[12:13], v[0:1], v[6:7]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[6:7]
-; GFX950-NEXT:    v_min_f64 v[6:7], v[2:3], v[8:9]
+; GFX950-NEXT:    v_min_f64 v[12:13], v[4:5], v[10:11]
+; GFX950-NEXT:    v_mov_b32_e32 v14, 0x7ff80000
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[10:11]
+; GFX950-NEXT:    v_min_f64 v[10:11], v[2:3], v[8:9]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e64 v0, v12, 0, vcc
-; GFX950-NEXT:    v_mov_b32_e32 v12, 0x7ff80000
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v13, v12, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v13, v14, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v4, v12, 0, vcc
 ; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[8:9]
+; GFX950-NEXT:    v_min_f64 v[8:9], v[0:1], v[6:7]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v11, v14, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v10, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[6:7]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v2, v6, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v7, v12, vcc
-; GFX950-NEXT:    v_min_f64 v[6:7], v[4:5], v[10:11]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[10:11]
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v4, v6, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v7, v12, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v9, v14, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v8, 0, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v3f64:
@@ -1092,13 +1106,21 @@ define <3 x double> @v_minimum_v3f64__nnan(<3 x double> %src0, <3 x double> %src
 ; GFX8-NEXT:    v_min_f64 v[4:5], v[4:5], v[10:11]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v3f64__nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[6:7]
-; GFX9-NEXT:    v_min_f64 v[2:3], v[2:3], v[8:9]
-; GFX9-NEXT:    v_min_f64 v[4:5], v[4:5], v[10:11]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v3f64__nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f64 v[0:1], v[0:1], v[6:7]
+; GFX900-NEXT:    v_min_f64 v[2:3], v[2:3], v[8:9]
+; GFX900-NEXT:    v_min_f64 v[4:5], v[4:5], v[10:11]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v3f64__nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f64 v[4:5], v[4:5], v[10:11]
+; GFX950-NEXT:    v_min_f64 v[2:3], v[2:3], v[8:9]
+; GFX950-NEXT:    v_min_f64 v[0:1], v[0:1], v[6:7]
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v3f64__nnan:
 ; GFX10:       ; %bb.0:
@@ -1189,22 +1211,22 @@ define <3 x double> @v_minimum_v3f64__nsz(<3 x double> %src0, <3 x double> %src1
 ; GFX950-LABEL: v_minimum_v3f64__nsz:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_min_f64 v[12:13], v[0:1], v[6:7]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[6:7]
-; GFX950-NEXT:    v_min_f64 v[6:7], v[2:3], v[8:9]
+; GFX950-NEXT:    v_min_f64 v[12:13], v[4:5], v[10:11]
+; GFX950-NEXT:    v_mov_b32_e32 v14, 0x7ff80000
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[10:11]
+; GFX950-NEXT:    v_min_f64 v[10:11], v[2:3], v[8:9]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e64 v0, v12, 0, vcc
-; GFX950-NEXT:    v_mov_b32_e32 v12, 0x7ff80000
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v13, v12, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v13, v14, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v4, v12, 0, vcc
 ; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[8:9]
+; GFX950-NEXT:    v_min_f64 v[8:9], v[0:1], v[6:7]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v11, v14, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v10, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[6:7]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v2, v6, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v7, v12, vcc
-; GFX950-NEXT:    v_min_f64 v[6:7], v[4:5], v[10:11]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[10:11]
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v4, v6, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v7, v12, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v9, v14, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v8, 0, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v3f64__nsz:
@@ -1273,13 +1295,21 @@ define <3 x double> @v_minimum_v3f64__nnan_nsz(<3 x double> %src0, <3 x double>
 ; GFX8-NEXT:    v_min_f64 v[4:5], v[4:5], v[10:11]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v3f64__nnan_nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[6:7]
-; GFX9-NEXT:    v_min_f64 v[2:3], v[2:3], v[8:9]
-; GFX9-NEXT:    v_min_f64 v[4:5], v[4:5], v[10:11]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v3f64__nnan_nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f64 v[0:1], v[0:1], v[6:7]
+; GFX900-NEXT:    v_min_f64 v[2:3], v[2:3], v[8:9]
+; GFX900-NEXT:    v_min_f64 v[4:5], v[4:5], v[10:11]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v3f64__nnan_nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f64 v[4:5], v[4:5], v[10:11]
+; GFX950-NEXT:    v_min_f64 v[2:3], v[2:3], v[8:9]
+; GFX950-NEXT:    v_min_f64 v[0:1], v[0:1], v[6:7]
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v3f64__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -1382,27 +1412,27 @@ define <4 x double> @v_minimum_v4f64(<4 x double> %src0, <4 x double> %src1) {
 ; GFX950-LABEL: v_minimum_v4f64:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_min_f64 v[16:17], v[0:1], v[8:9]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
-; GFX950-NEXT:    v_min_f64 v[8:9], v[2:3], v[10:11]
+; GFX950-NEXT:    v_min_f64 v[16:17], v[6:7], v[14:15]
+; GFX950-NEXT:    v_mov_b32_e32 v18, 0x7ff80000
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[6:7], v[14:15]
+; GFX950-NEXT:    v_min_f64 v[14:15], v[4:5], v[12:13]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e64 v0, v16, 0, vcc
-; GFX950-NEXT:    v_mov_b32_e32 v16, 0x7ff80000
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v17, v16, vcc
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[10:11]
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v2, v8, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v9, v16, vcc
-; GFX950-NEXT:    v_min_f64 v[8:9], v[4:5], v[12:13]
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v17, v18, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v6, v16, 0, vcc
 ; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[12:13]
+; GFX950-NEXT:    v_min_f64 v[12:13], v[2:3], v[10:11]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v15, v18, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v4, v14, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[10:11]
+; GFX950-NEXT:    v_min_f64 v[10:11], v[0:1], v[8:9]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v13, v18, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v12, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v4, v8, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v9, v16, vcc
-; GFX950-NEXT:    v_min_f64 v[8:9], v[6:7], v[14:15]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[6:7], v[14:15]
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v6, v8, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v9, v16, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v11, v18, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v10, 0, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v4f64:
@@ -1482,14 +1512,23 @@ define <4 x double> @v_minimum_v4f64__nnan(<4 x double> %src0, <4 x double> %src
 ; GFX8-NEXT:    v_min_f64 v[6:7], v[6:7], v[14:15]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v4f64__nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[8:9]
-; GFX9-NEXT:    v_min_f64 v[2:3], v[2:3], v[10:11]
-; GFX9-NEXT:    v_min_f64 v[4:5], v[4:5], v[12:13]
-; GFX9-NEXT:    v_min_f64 v[6:7], v[6:7], v[14:15]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v4f64__nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f64 v[0:1], v[0:1], v[8:9]
+; GFX900-NEXT:    v_min_f64 v[2:3], v[2:3], v[10:11]
+; GFX900-NEXT:    v_min_f64 v[4:5], v[4:5], v[12:13]
+; GFX900-NEXT:    v_min_f64 v[6:7], v[6:7], v[14:15]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v4f64__nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f64 v[6:7], v[6:7], v[14:15]
+; GFX950-NEXT:    v_min_f64 v[4:5], v[4:5], v[12:13]
+; GFX950-NEXT:    v_min_f64 v[2:3], v[2:3], v[10:11]
+; GFX950-NEXT:    v_min_f64 v[0:1], v[0:1], v[8:9]
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v4f64__nnan:
 ; GFX10:       ; %bb.0:
@@ -1595,27 +1634,27 @@ define <4 x double> @v_minimum_v4f64__nsz(<4 x double> %src0, <4 x double> %src1
 ; GFX950-LABEL: v_minimum_v4f64__nsz:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_min_f64 v[16:17], v[0:1], v[8:9]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
-; GFX950-NEXT:    v_min_f64 v[8:9], v[2:3], v[10:11]
+; GFX950-NEXT:    v_min_f64 v[16:17], v[6:7], v[14:15]
+; GFX950-NEXT:    v_mov_b32_e32 v18, 0x7ff80000
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[6:7], v[14:15]
+; GFX950-NEXT:    v_min_f64 v[14:15], v[4:5], v[12:13]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e64 v0, v16, 0, vcc
-; GFX950-NEXT:    v_mov_b32_e32 v16, 0x7ff80000
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v17, v16, vcc
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[10:11]
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v2, v8, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v9, v16, vcc
-; GFX950-NEXT:    v_min_f64 v[8:9], v[4:5], v[12:13]
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v17, v18, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v6, v16, 0, vcc
 ; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[12:13]
+; GFX950-NEXT:    v_min_f64 v[12:13], v[2:3], v[10:11]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v15, v18, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v4, v14, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[10:11]
+; GFX950-NEXT:    v_min_f64 v[10:11], v[0:1], v[8:9]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v13, v18, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v12, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v4, v8, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v9, v16, vcc
-; GFX950-NEXT:    v_min_f64 v[8:9], v[6:7], v[14:15]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[6:7], v[14:15]
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v6, v8, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v9, v16, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v11, v18, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v10, 0, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v4f64__nsz:
@@ -1695,14 +1734,23 @@ define <4 x double> @v_minimum_v4f64__nnan_nsz(<4 x double> %src0, <4 x double>
 ; GFX8-NEXT:    v_min_f64 v[6:7], v[6:7], v[14:15]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v4f64__nnan_nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[8:9]
-; GFX9-NEXT:    v_min_f64 v[2:3], v[2:3], v[10:11]
-; GFX9-NEXT:    v_min_f64 v[4:5], v[4:5], v[12:13]
-; GFX9-NEXT:    v_min_f64 v[6:7], v[6:7], v[14:15]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v4f64__nnan_nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f64 v[0:1], v[0:1], v[8:9]
+; GFX900-NEXT:    v_min_f64 v[2:3], v[2:3], v[10:11]
+; GFX900-NEXT:    v_min_f64 v[4:5], v[4:5], v[12:13]
+; GFX900-NEXT:    v_min_f64 v[6:7], v[6:7], v[14:15]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v4f64__nnan_nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f64 v[6:7], v[6:7], v[14:15]
+; GFX950-NEXT:    v_min_f64 v[4:5], v[4:5], v[12:13]
+; GFX950-NEXT:    v_min_f64 v[2:3], v[2:3], v[10:11]
+; GFX950-NEXT:    v_min_f64 v[0:1], v[0:1], v[8:9]
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v4f64__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -1864,43 +1912,43 @@ define <8 x double> @v_minimum_v8f64(<8 x double> %src0, <8 x double> %src1) {
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    scratch_load_dword v31, off, s32
 ; GFX950-NEXT:    v_mov_b32_e32 v54, 0x7ff80000
-; GFX950-NEXT:    v_min_f64 v[32:33], v[0:1], v[16:17]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[16:17]
-; GFX950-NEXT:    v_min_f64 v[34:35], v[2:3], v[18:19]
-; GFX950-NEXT:    v_min_f64 v[36:37], v[4:5], v[20:21]
-; GFX950-NEXT:    v_cndmask_b32_e64 v0, v32, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v33, v54, vcc
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[18:19]
+; GFX950-NEXT:    v_min_f64 v[32:33], v[12:13], v[28:29]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[12:13], v[28:29]
+; GFX950-NEXT:    v_min_f64 v[34:35], v[10:11], v[26:27]
+; GFX950-NEXT:    v_min_f64 v[36:37], v[8:9], v[24:25]
+; GFX950-NEXT:    v_cndmask_b32_e32 v13, v33, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v12, v32, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[10:11], v[26:27]
 ; GFX950-NEXT:    v_min_f64 v[38:39], v[6:7], v[22:23]
-; GFX950-NEXT:    v_min_f64 v[48:49], v[8:9], v[24:25]
-; GFX950-NEXT:    v_cndmask_b32_e64 v2, v34, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v35, v54, vcc
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[20:21]
-; GFX950-NEXT:    v_min_f64 v[50:51], v[10:11], v[26:27]
-; GFX950-NEXT:    v_min_f64 v[52:53], v[12:13], v[28:29]
-; GFX950-NEXT:    v_cndmask_b32_e64 v4, v36, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v37, v54, vcc
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[6:7], v[22:23]
-; GFX950-NEXT:    s_waitcnt vmcnt(0)
-; GFX950-NEXT:    v_min_f64 v[16:17], v[14:15], v[30:31]
-; GFX950-NEXT:    v_cndmask_b32_e64 v6, v38, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v39, v54, vcc
+; GFX950-NEXT:    v_min_f64 v[48:49], v[4:5], v[20:21]
+; GFX950-NEXT:    v_cndmask_b32_e32 v11, v35, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v10, v34, 0, vcc
 ; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[8:9], v[24:25]
+; GFX950-NEXT:    v_min_f64 v[50:51], v[2:3], v[18:19]
+; GFX950-NEXT:    v_min_f64 v[52:53], v[0:1], v[16:17]
+; GFX950-NEXT:    v_cndmask_b32_e32 v9, v37, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v8, v36, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[6:7], v[22:23]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v8, v48, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v9, v49, v54, vcc
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[10:11], v[26:27]
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v39, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v6, v38, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[20:21]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v10, v50, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v11, v51, v54, vcc
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[12:13], v[28:29]
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v49, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v4, v48, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[18:19]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v12, v52, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v13, v53, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v51, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v50, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[16:17]
+; GFX950-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-NEXT:    v_min_f64 v[16:17], v[14:15], v[30:31]
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v53, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v52, 0, vcc
 ; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[14:15], v[30:31]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v14, v16, 0, vcc
 ; GFX950-NEXT:    v_cndmask_b32_e32 v15, v17, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v14, v16, 0, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v8f64:
@@ -2371,152 +2419,144 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX950-NEXT:    v_accvgpr_write_b32 a3, v43 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a4, v44 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a5, v45 ; Reload Reuse
-; GFX950-NEXT:    v_accvgpr_write_b32 a6, v46 ; Reload Reuse
-; GFX950-NEXT:    v_accvgpr_write_b32 a7, v47 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a8, v56 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a9, v57 ; Reload Reuse
-; GFX950-NEXT:    scratch_load_dword v33, off, s32 offset:8
-; GFX950-NEXT:    scratch_load_dword v32, off, s32 offset:4
-; GFX950-NEXT:    scratch_load_dword v37, off, s32 offset:16
-; GFX950-NEXT:    scratch_load_dword v36, off, s32 offset:12
-; GFX950-NEXT:    scratch_load_dword v39, off, s32 offset:24
-; GFX950-NEXT:    scratch_load_dword v38, off, s32 offset:20
+; GFX950-NEXT:    scratch_load_dword v41, off, s32 offset:120
+; GFX950-NEXT:    scratch_load_dword v40, off, s32 offset:116
 ; GFX950-NEXT:    scratch_load_dword v57, off, s32 offset:32
 ; GFX950-NEXT:    scratch_load_dword v56, off, s32 offset:28
-; GFX950-NEXT:    scratch_load_dword v47, off, s32 offset:40
-; GFX950-NEXT:    scratch_load_dword v46, off, s32 offset:36
-; GFX950-NEXT:    scratch_load_dword v45, off, s32 offset:48
-; GFX950-NEXT:    scratch_load_dword v44, off, s32 offset:44
-; GFX950-NEXT:    scratch_load_dword v43, off, s32 offset:56
-; GFX950-NEXT:    scratch_load_dword v42, off, s32 offset:52
-; GFX950-NEXT:    scratch_load_dword v41, off, s32 offset:64
-; GFX950-NEXT:    scratch_load_dword v40, off, s32 offset:60
-; GFX950-NEXT:    scratch_load_dword v55, off, s32 offset:72
-; GFX950-NEXT:    scratch_load_dword v54, off, s32 offset:68
-; GFX950-NEXT:    scratch_load_dword v53, off, s32 offset:80
-; GFX950-NEXT:    scratch_load_dword v52, off, s32 offset:76
-; GFX950-NEXT:    scratch_load_dword v51, off, s32 offset:88
-; GFX950-NEXT:    scratch_load_dword v50, off, s32 offset:84
-; GFX950-NEXT:    scratch_load_dword v49, off, s32 offset:96
-; GFX950-NEXT:    scratch_load_dword v48, off, s32 offset:92
+; GFX950-NEXT:    scratch_load_dword v45, off, s32 offset:24
+; GFX950-NEXT:    scratch_load_dword v44, off, s32 offset:20
+; GFX950-NEXT:    scratch_load_dword v43, off, s32 offset:16
+; GFX950-NEXT:    scratch_load_dword v42, off, s32 offset:12
+; GFX950-NEXT:    scratch_load_dword v53, off, s32 offset:8
+; GFX950-NEXT:    scratch_load_dword v52, off, s32 offset:4
+; GFX950-NEXT:    scratch_load_dword v49, off, s32 offset:112
 ; GFX950-NEXT:    scratch_load_dword v31, off, s32
-; GFX950-NEXT:    scratch_load_dword v35, off, s32 offset:104
-; GFX950-NEXT:    scratch_load_dword v34, off, s32 offset:100
+; GFX950-NEXT:    scratch_load_dword v55, off, s32 offset:128
+; GFX950-NEXT:    scratch_load_dword v54, off, s32 offset:124
+; GFX950-NEXT:    scratch_load_dword v48, off, s32 offset:108
+; GFX950-NEXT:    scratch_load_dword v51, off, s32 offset:104
+; GFX950-NEXT:    scratch_load_dword v50, off, s32 offset:100
+; GFX950-NEXT:    scratch_load_dword v39, off, s32 offset:96
+; GFX950-NEXT:    scratch_load_dword v38, off, s32 offset:92
+; GFX950-NEXT:    scratch_load_dword v37, off, s32 offset:88
+; GFX950-NEXT:    scratch_load_dword v36, off, s32 offset:84
+; GFX950-NEXT:    scratch_load_dword v35, off, s32 offset:80
+; GFX950-NEXT:    scratch_load_dword v34, off, s32 offset:76
+; GFX950-NEXT:    scratch_load_dword v33, off, s32 offset:72
+; GFX950-NEXT:    scratch_load_dword v32, off, s32 offset:68
+; GFX950-NEXT:    v_accvgpr_write_b32 a6, v46 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_write_b32 a7, v47 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a10, v58 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a11, v59 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a12, v60 ; Reload Reuse
-; GFX950-NEXT:    v_accvgpr_write_b32 a13, v61 ; Reload Reuse
-; GFX950-NEXT:    v_accvgpr_write_b32 a14, v62 ; Reload Reuse
-; GFX950-NEXT:    v_accvgpr_write_b32 a15, v63 ; Reload Reuse
-; GFX950-NEXT:    s_waitcnt vmcnt(25)
-; GFX950-NEXT:    v_min_f64 v[58:59], v[0:1], v[32:33]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[32:33]
-; GFX950-NEXT:    scratch_load_dword v33, off, s32 offset:112
-; GFX950-NEXT:    scratch_load_dword v32, off, s32 offset:108
-; GFX950-NEXT:    s_waitcnt vmcnt(25)
-; GFX950-NEXT:    v_min_f64 v[60:61], v[2:3], v[36:37]
-; GFX950-NEXT:    v_cmp_u_f64_e64 s[0:1], v[2:3], v[36:37]
-; GFX950-NEXT:    scratch_load_dword v37, off, s32 offset:120
-; GFX950-NEXT:    scratch_load_dword v36, off, s32 offset:116
-; GFX950-NEXT:    s_waitcnt vmcnt(25)
-; GFX950-NEXT:    v_min_f64 v[62:63], v[4:5], v[38:39]
-; GFX950-NEXT:    v_cmp_u_f64_e64 s[2:3], v[4:5], v[38:39]
-; GFX950-NEXT:    scratch_load_dword v39, off, s32 offset:128
-; GFX950-NEXT:    scratch_load_dword v38, off, s32 offset:124
-; GFX950-NEXT:    v_mov_b32_e32 v2, 0x7ff80000
-; GFX950-NEXT:    s_waitcnt vmcnt(25)
-; GFX950-NEXT:    v_min_f64 v[0:1], v[6:7], v[56:57]
-; GFX950-NEXT:    v_cmp_u_f64_e64 s[4:5], v[6:7], v[56:57]
+; GFX950-NEXT:    v_mov_b32_e32 v60, 0x7ff80000
 ; GFX950-NEXT:    s_waitcnt vmcnt(23)
-; GFX950-NEXT:    v_min_f64 v[56:57], v[8:9], v[46:47]
-; GFX950-NEXT:    v_cndmask_b32_e64 v58, v58, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v59, v59, v2, vcc
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[8:9], v[46:47]
-; GFX950-NEXT:    v_cndmask_b32_e64 v6, v0, 0, s[4:5]
-; GFX950-NEXT:    v_cndmask_b32_e64 v7, v1, v2, s[4:5]
-; GFX950-NEXT:    v_cndmask_b32_e64 v8, v56, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v9, v57, v2, vcc
-; GFX950-NEXT:    s_waitcnt vmcnt(21)
-; GFX950-NEXT:    v_min_f64 v[0:1], v[10:11], v[44:45]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[10:11], v[44:45]
-; GFX950-NEXT:    v_cndmask_b32_e64 v60, v60, 0, s[0:1]
-; GFX950-NEXT:    v_cndmask_b32_e64 v3, v61, v2, s[0:1]
-; GFX950-NEXT:    v_cndmask_b32_e64 v10, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v11, v1, v2, vcc
+; GFX950-NEXT:    v_min_f64 v[46:47], v[28:29], v[40:41]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[28:29], v[40:41]
+; GFX950-NEXT:    scratch_load_dword v41, off, s32 offset:64
+; GFX950-NEXT:    scratch_load_dword v40, off, s32 offset:60
+; GFX950-NEXT:    s_waitcnt vmcnt(23)
+; GFX950-NEXT:    v_min_f64 v[58:59], v[6:7], v[56:57]
+; GFX950-NEXT:    v_cmp_u_f64_e64 s[0:1], v[6:7], v[56:57]
+; GFX950-NEXT:    scratch_load_dword v7, off, s32 offset:56
+; GFX950-NEXT:    scratch_load_dword v6, off, s32 offset:52
+; GFX950-NEXT:    s_waitcnt vmcnt(23)
+; GFX950-NEXT:    v_min_f64 v[56:57], v[4:5], v[44:45]
+; GFX950-NEXT:    v_cmp_u_f64_e64 s[2:3], v[4:5], v[44:45]
+; GFX950-NEXT:    scratch_load_dword v5, off, s32 offset:48
+; GFX950-NEXT:    scratch_load_dword v4, off, s32 offset:44
+; GFX950-NEXT:    s_waitcnt vmcnt(23)
+; GFX950-NEXT:    v_min_f64 v[44:45], v[2:3], v[42:43]
+; GFX950-NEXT:    v_cmp_u_f64_e64 s[4:5], v[2:3], v[42:43]
+; GFX950-NEXT:    scratch_load_dword v3, off, s32 offset:40
+; GFX950-NEXT:    scratch_load_dword v2, off, s32 offset:36
+; GFX950-NEXT:    s_waitcnt vmcnt(23)
+; GFX950-NEXT:    v_min_f64 v[42:43], v[0:1], v[52:53]
+; GFX950-NEXT:    v_cmp_u_f64_e64 s[6:7], v[0:1], v[52:53]
 ; GFX950-NEXT:    s_waitcnt vmcnt(19)
-; GFX950-NEXT:    v_min_f64 v[0:1], v[12:13], v[42:43]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[12:13], v[42:43]
-; GFX950-NEXT:    v_cndmask_b32_e64 v4, v62, 0, s[2:3]
-; GFX950-NEXT:    v_cndmask_b32_e64 v5, v63, v2, s[2:3]
-; GFX950-NEXT:    v_cndmask_b32_e64 v12, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v13, v1, v2, vcc
-; GFX950-NEXT:    s_waitcnt vmcnt(17)
-; GFX950-NEXT:    v_min_f64 v[0:1], v[14:15], v[40:41]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[14:15], v[40:41]
-; GFX950-NEXT:    v_accvgpr_read_b32 v63, a15 ; Reload Reuse
-; GFX950-NEXT:    v_accvgpr_read_b32 v62, a14 ; Reload Reuse
-; GFX950-NEXT:    v_cndmask_b32_e64 v14, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v15, v1, v2, vcc
-; GFX950-NEXT:    s_waitcnt vmcnt(15)
-; GFX950-NEXT:    v_min_f64 v[0:1], v[16:17], v[54:55]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[16:17], v[54:55]
-; GFX950-NEXT:    v_accvgpr_read_b32 v61, a13 ; Reload Reuse
-; GFX950-NEXT:    v_accvgpr_read_b32 v57, a9 ; Reload Reuse
-; GFX950-NEXT:    v_cndmask_b32_e64 v16, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v17, v1, v2, vcc
-; GFX950-NEXT:    s_waitcnt vmcnt(13)
-; GFX950-NEXT:    v_min_f64 v[0:1], v[18:19], v[52:53]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[18:19], v[52:53]
-; GFX950-NEXT:    v_accvgpr_read_b32 v56, a8 ; Reload Reuse
+; GFX950-NEXT:    v_min_f64 v[0:1], v[30:31], v[54:55]
+; GFX950-NEXT:    s_waitcnt vmcnt(18)
+; GFX950-NEXT:    v_min_f64 v[52:53], v[26:27], v[48:49]
+; GFX950-NEXT:    v_cmp_u_f64_e64 s[8:9], v[30:31], v[54:55]
+; GFX950-NEXT:    v_cndmask_b32_e32 v29, v47, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v28, v46, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[26:27], v[48:49]
+; GFX950-NEXT:    v_cndmask_b32_e64 v31, v1, v60, s[8:9]
+; GFX950-NEXT:    v_cndmask_b32_e64 v30, v0, 0, s[8:9]
+; GFX950-NEXT:    v_cndmask_b32_e32 v27, v53, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v26, v52, 0, vcc
+; GFX950-NEXT:    s_waitcnt vmcnt(16)
+; GFX950-NEXT:    v_min_f64 v[0:1], v[24:25], v[50:51]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[24:25], v[50:51]
 ; GFX950-NEXT:    v_accvgpr_read_b32 v47, a7 ; Reload Reuse
-; GFX950-NEXT:    v_cndmask_b32_e64 v18, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v19, v1, v2, vcc
-; GFX950-NEXT:    s_waitcnt vmcnt(11)
-; GFX950-NEXT:    v_min_f64 v[0:1], v[20:21], v[50:51]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[20:21], v[50:51]
 ; GFX950-NEXT:    v_accvgpr_read_b32 v46, a6 ; Reload Reuse
-; GFX950-NEXT:    v_accvgpr_read_b32 v45, a5 ; Reload Reuse
-; GFX950-NEXT:    v_cndmask_b32_e64 v20, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v21, v1, v2, vcc
-; GFX950-NEXT:    s_waitcnt vmcnt(9)
-; GFX950-NEXT:    v_min_f64 v[0:1], v[22:23], v[48:49]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[22:23], v[48:49]
-; GFX950-NEXT:    v_accvgpr_read_b32 v44, a4 ; Reload Reuse
-; GFX950-NEXT:    v_accvgpr_read_b32 v43, a3 ; Reload Reuse
+; GFX950-NEXT:    v_cndmask_b32_e32 v25, v1, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v24, v0, 0, vcc
+; GFX950-NEXT:    s_waitcnt vmcnt(14)
+; GFX950-NEXT:    v_min_f64 v[0:1], v[22:23], v[38:39]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[22:23], v[38:39]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v23, v1, v60, vcc
 ; GFX950-NEXT:    v_cndmask_b32_e64 v22, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v23, v1, v2, vcc
+; GFX950-NEXT:    s_waitcnt vmcnt(12)
+; GFX950-NEXT:    v_min_f64 v[0:1], v[20:21], v[36:37]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[20:21], v[36:37]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v21, v1, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v20, v0, 0, vcc
+; GFX950-NEXT:    s_waitcnt vmcnt(10)
+; GFX950-NEXT:    v_min_f64 v[0:1], v[18:19], v[34:35]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[18:19], v[34:35]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v19, v1, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v18, v0, 0, vcc
+; GFX950-NEXT:    s_waitcnt vmcnt(8)
+; GFX950-NEXT:    v_min_f64 v[0:1], v[16:17], v[32:33]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[16:17], v[32:33]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v17, v1, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v16, v0, 0, vcc
 ; GFX950-NEXT:    s_waitcnt vmcnt(6)
-; GFX950-NEXT:    v_min_f64 v[0:1], v[24:25], v[34:35]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[24:25], v[34:35]
-; GFX950-NEXT:    v_accvgpr_read_b32 v42, a2 ; Reload Reuse
+; GFX950-NEXT:    v_min_f64 v[0:1], v[14:15], v[40:41]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[14:15], v[40:41]
 ; GFX950-NEXT:    v_accvgpr_read_b32 v41, a1 ; Reload Reuse
-; GFX950-NEXT:    v_cndmask_b32_e64 v24, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v25, v1, v2, vcc
 ; GFX950-NEXT:    v_accvgpr_read_b32 v40, a0 ; Reload Reuse
+; GFX950-NEXT:    v_cndmask_b32_e32 v15, v1, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v14, v0, 0, vcc
 ; GFX950-NEXT:    s_waitcnt vmcnt(4)
-; GFX950-NEXT:    v_min_f64 v[0:1], v[26:27], v[32:33]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[26:27], v[32:33]
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v26, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v27, v1, v2, vcc
+; GFX950-NEXT:    v_min_f64 v[0:1], v[12:13], v[6:7]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[12:13], v[6:7]
+; GFX950-NEXT:    v_cndmask_b32_e64 v7, v59, v60, s[0:1]
+; GFX950-NEXT:    v_cndmask_b32_e64 v6, v58, 0, s[0:1]
+; GFX950-NEXT:    v_cndmask_b32_e32 v13, v1, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v12, v0, 0, vcc
 ; GFX950-NEXT:    s_waitcnt vmcnt(2)
-; GFX950-NEXT:    v_min_f64 v[0:1], v[28:29], v[36:37]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[28:29], v[36:37]
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v28, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v29, v1, v2, vcc
+; GFX950-NEXT:    v_min_f64 v[0:1], v[10:11], v[4:5]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[10:11], v[4:5]
+; GFX950-NEXT:    v_cndmask_b32_e64 v5, v57, v60, s[2:3]
+; GFX950-NEXT:    v_cndmask_b32_e64 v4, v56, 0, s[2:3]
+; GFX950-NEXT:    v_cndmask_b32_e32 v11, v1, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v10, v0, 0, vcc
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
-; GFX950-NEXT:    v_min_f64 v[0:1], v[30:31], v[38:39]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[30:31], v[38:39]
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v30, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v31, v1, v2, vcc
-; GFX950-NEXT:    v_mov_b32_e32 v0, v58
-; GFX950-NEXT:    v_mov_b32_e32 v1, v59
-; GFX950-NEXT:    v_mov_b32_e32 v2, v60
+; GFX950-NEXT:    v_min_f64 v[0:1], v[8:9], v[2:3]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[8:9], v[2:3]
+; GFX950-NEXT:    v_cndmask_b32_e64 v3, v45, v60, s[4:5]
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v44, 0, s[4:5]
+; GFX950-NEXT:    v_cndmask_b32_e32 v9, v1, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v8, v0, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v1, v43, v60, s[6:7]
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v42, 0, s[6:7]
 ; GFX950-NEXT:    v_accvgpr_read_b32 v60, a12 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_read_b32 v59, a11 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_read_b32 v58, a10 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v57, a9 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v56, a8 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v45, a5 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v44, a4 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v43, a3 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v42, a2 ; Reload Reuse
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v16f64:
diff --git a/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll b/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll
index 3b855a56a5abb..88032317165bb 100644
--- a/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll
+++ b/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll
@@ -7,11 +7,11 @@ define <2 x i32> @uniform_masked_load_ptr1_mask_v2i32(ptr addrspace(1) inreg noc
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
 ; GFX942-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX942-NEXT:    s_cbranch_execz .LBB0_2
 ; GFX942-NEXT:  ; %bb.1: ; %cond.load
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    global_load_dwordx2 v[0:1], v0, s[0:1]
 ; GFX942-NEXT:  .LBB0_2:
 ; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
@@ -30,13 +30,12 @@ define <4 x i32> @uniform_masked_load_ptr1_mask_v4i32(ptr addrspace(1) inreg noc
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX942-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX942-NEXT:  ; %bb.1: ; %cond.load
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    global_load_dwordx4 v[0:3], v0, s[0:1]
 ; GFX942-NEXT:  .LBB1_2:
 ; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
@@ -80,20 +79,16 @@ define <8 x i32> @uniform_masked_load_ptr1_mask_v8i32(ptr addrspace(1) inreg noc
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-NEXT:    v_mov_b32_e32 v7, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
 ; GFX942-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX942-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX942-NEXT:  ; %bb.1: ; %cond.load
-; GFX942-NEXT:    global_load_dwordx4 v[4:7], v0, s[0:1] offset:16
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    global_load_dwordx4 v[0:3], v0, s[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    global_load_dwordx4 v[4:7], v8, s[0:1] offset:16
+; GFX942-NEXT:    global_load_dwordx4 v[0:3], v8, s[0:1]
 ; GFX942-NEXT:  .LBB3_2:
 ; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
index 4f73e8e9c1883..64c602f81cb23 100644
--- a/llvm/test/CodeGen/AMDGPU/maximumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
@@ -5,11 +5,11 @@
 ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8,GFX8-SDAG %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s
 
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900,GFX9-SDAG,GFX900-SDAG %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900,GFX9-GISEL,GFX900-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX900,GFX900-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX900,GFX900-GISEL %s
 
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX9,GFX950,GFX9-SDAG,GFX950-SDAG %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX9,GFX950,GFX9-GISEL,GFX950-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-GISEL %s
 
 ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
@@ -62,21 +62,37 @@ define half @v_maximumnum_f16(half %x, half %y) {
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_maximumnum_f16:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_maximumnum_f16:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_maximumnum_f16:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX900-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_maximumnum_f16:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_f16:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX950-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_maximumnum_f16:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_f16:
 ; GFX10-SDAG:       ; %bb.0:
@@ -211,11 +227,17 @@ define half @v_maximumnum_f16_nnan(half %x, half %y) {
 ; GFX8-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_f16_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_f16_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_f16_nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_f16_nnan:
 ; GFX10:       ; %bb.0:
@@ -283,12 +305,19 @@ define half @v_maximumnum_f16_1.0(half %x) {
 ; GFX8-NEXT:    v_max_f16_e32 v0, 1.0, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_f16_1.0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX9-NEXT:    v_max_f16_e32 v0, 1.0, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_f16_1.0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX900-NEXT:    v_max_f16_e32 v0, 1.0, v0
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_f16_1.0:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX950-NEXT:    v_max_f16_e32 v0, 1.0, v0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_f16_1.0:
 ; GFX10:       ; %bb.0:
@@ -373,21 +402,37 @@ define float @v_maximumnum_f32(float %x, float %y) {
 ; GFX8-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_maximumnum_f32:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_maximumnum_f32:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_maximumnum_f32:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_maximumnum_f32:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_f32:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_maximumnum_f32:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_f32:
 ; GFX10-SDAG:       ; %bb.0:
@@ -461,11 +506,17 @@ define float @v_maximumnum_f32_nnan(float %x, float %y) {
 ; GFX8-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_f32_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_f32_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_f32_nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_f32_nnan:
 ; GFX10:       ; %bb.0:
@@ -525,21 +576,37 @@ define double @v_maximumnum_f64(double %x, double %y) {
 ; GFX8-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_maximumnum_f64:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX9-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX9-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_maximumnum_f64:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX9-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX9-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_maximumnum_f64:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX900-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX900-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_maximumnum_f64:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX900-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX900-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_f64:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX950-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX950-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_maximumnum_f64:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX950-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_f64:
 ; GFX10-SDAG:       ; %bb.0:
@@ -617,11 +684,17 @@ define double @v_maximumnum_f64_nnan(double %x, double %y) {
 ; GFX8-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_f64_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_f64_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_f64_nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_f64_nnan:
 ; GFX10:       ; %bb.0:
@@ -663,12 +736,19 @@ define float @v_maximumnum_f32_1.0(float %x) {
 ; GFX8-NEXT:    v_max_f32_e32 v0, 1.0, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_f32_1.0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-NEXT:    v_max_f32_e32 v0, 1.0, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_f32_1.0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-NEXT:    v_max_f32_e32 v0, 1.0, v0
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_f32_1.0:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-NEXT:    v_max_f32_e32 v0, 1.0, v0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_f32_1.0:
 ; GFX10:       ; %bb.0:
@@ -717,13 +797,21 @@ define float @v_maximumnum_f32_rhs_not_snan(float %x, float %y) {
 ; GFX8-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_f32_rhs_not_snan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_f32_rhs_not_snan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_f32_rhs_not_snan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_f32_rhs_not_snan:
 ; GFX10:       ; %bb.0:
@@ -774,13 +862,21 @@ define float @v_maximumnum_f32_lhs_not_snan(float %x, float %y) {
 ; GFX8-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_f32_lhs_not_snan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_f32_lhs_not_snan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_f32_lhs_not_snan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_f32_lhs_not_snan:
 ; GFX10:       ; %bb.0:
@@ -831,13 +927,21 @@ define float @v_maximumnum_f32_both_operands_not_snan(float %x, float %y) {
 ; GFX8-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_f32_both_operands_not_snan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_f32_both_operands_not_snan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_f32_both_operands_not_snan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_f32_both_operands_not_snan:
 ; GFX10:       ; %bb.0:
@@ -887,12 +991,19 @@ define double @v_maximumnum_f64_1.0(double %x) {
 ; GFX8-NEXT:    v_max_f64 v[0:1], v[0:1], 1.0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_f64_1.0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], 1.0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_f64_1.0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX900-NEXT:    v_max_f64 v[0:1], v[0:1], 1.0
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_f64_1.0:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX950-NEXT:    v_max_f64 v[0:1], v[0:1], 1.0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_f64_1.0:
 ; GFX10:       ; %bb.0:
@@ -2190,21 +2301,37 @@ define float @v_maximumnum_f32_fabs_rhs(float %x, float %y) {
 ; GFX8-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_maximumnum_f32_fabs_rhs:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_maximumnum_f32_fabs_rhs:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_maximumnum_f32_fabs_rhs:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_maximumnum_f32_fabs_rhs:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_f32_fabs_rhs:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_maximumnum_f32_fabs_rhs:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_f32_fabs_rhs:
 ; GFX10-SDAG:       ; %bb.0:
@@ -2303,21 +2430,37 @@ define float @v_maximumnum_f32_fneg_fabs_rhs(float %x, float %y) {
 ; GFX8-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_maximumnum_f32_fneg_fabs_rhs:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1|
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_maximumnum_f32_fneg_fabs_rhs:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1|
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_maximumnum_f32_fneg_fabs_rhs:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1|
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_maximumnum_f32_fneg_fabs_rhs:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1|
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_f32_fneg_fabs_rhs:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1|
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_maximumnum_f32_fneg_fabs_rhs:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1|
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_f32_fneg_fabs_rhs:
 ; GFX10-SDAG:       ; %bb.0:
@@ -2417,21 +2560,37 @@ define float @v_maximumnum_f32_fabs(float %x, float %y) {
 ; GFX8-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_maximumnum_f32_fabs:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
-; GFX9-SDAG-NEXT:    v_max_f32_e64 v0, |v0|, |v0|
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_maximumnum_f32_fabs:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f32_e64 v0, |v0|, |v0|
-; GFX9-GISEL-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_maximumnum_f32_fabs:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
+; GFX900-SDAG-NEXT:    v_max_f32_e64 v0, |v0|, |v0|
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_maximumnum_f32_fabs:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f32_e64 v0, |v0|, |v0|
+; GFX900-GISEL-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_f32_fabs:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
+; GFX950-SDAG-NEXT:    v_max_f32_e64 v0, |v0|, |v0|
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_maximumnum_f32_fabs:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f32_e64 v0, |v0|, |v0|
+; GFX950-GISEL-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_f32_fabs:
 ; GFX10-SDAG:       ; %bb.0:
@@ -2531,21 +2690,37 @@ define float @v_maximumnum_f32_fneg(float %x, float %y) {
 ; GFX8-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_maximumnum_f32_fneg:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f32_e64 v1, -v1, -v1
-; GFX9-SDAG-NEXT:    v_max_f32_e64 v0, -v0, -v0
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_maximumnum_f32_fneg:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
-; GFX9-GISEL-NEXT:    v_max_f32_e64 v1, -v1, -v1
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_maximumnum_f32_fneg:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f32_e64 v1, -v1, -v1
+; GFX900-SDAG-NEXT:    v_max_f32_e64 v0, -v0, -v0
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_maximumnum_f32_fneg:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
+; GFX900-GISEL-NEXT:    v_max_f32_e64 v1, -v1, -v1
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_f32_fneg:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f32_e64 v1, -v1, -v1
+; GFX950-SDAG-NEXT:    v_max_f32_e64 v0, -v0, -v0
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_maximumnum_f32_fneg:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
+; GFX950-GISEL-NEXT:    v_max_f32_e64 v1, -v1, -v1
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_f32_fneg:
 ; GFX10-SDAG:       ; %bb.0:
@@ -2648,21 +2823,37 @@ define half @v_maximumnum_f16_fabs_rhs(half %x, half %y) {
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_maximumnum_f16_fabs_rhs:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_maximumnum_f16_fabs_rhs:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
-; GFX9-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_maximumnum_f16_fabs_rhs:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
+; GFX900-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_maximumnum_f16_fabs_rhs:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
+; GFX900-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_f16_fabs_rhs:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
+; GFX950-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_maximumnum_f16_fabs_rhs:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
+; GFX950-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_f16_fabs_rhs:
 ; GFX10-SDAG:       ; %bb.0:
@@ -2808,21 +2999,37 @@ define half @v_maximumnum_f16_fneg_fabs_rhs(half %x, half %y) {
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_maximumnum_f16_fneg_fabs_rhs:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_maximumnum_f16_fneg_fabs_rhs:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
-; GFX9-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_maximumnum_f16_fneg_fabs_rhs:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
+; GFX900-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_maximumnum_f16_fneg_fabs_rhs:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
+; GFX900-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_f16_fneg_fabs_rhs:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
+; GFX950-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_maximumnum_f16_fneg_fabs_rhs:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
+; GFX950-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_f16_fneg_fabs_rhs:
 ; GFX10-SDAG:       ; %bb.0:
@@ -2969,21 +3176,37 @@ define half @v_maximumnum_f16_fabs(half %x, half %y) {
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_maximumnum_f16_fabs:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
-; GFX9-SDAG-NEXT:    v_max_f16_e64 v0, |v0|, |v0|
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_maximumnum_f16_fabs:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f16_e64 v0, |v0|, |v0|
-; GFX9-GISEL-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
-; GFX9-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_maximumnum_f16_fabs:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
+; GFX900-SDAG-NEXT:    v_max_f16_e64 v0, |v0|, |v0|
+; GFX900-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_maximumnum_f16_fabs:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f16_e64 v0, |v0|, |v0|
+; GFX900-GISEL-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
+; GFX900-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_f16_fabs:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
+; GFX950-SDAG-NEXT:    v_max_f16_e64 v0, |v0|, |v0|
+; GFX950-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_maximumnum_f16_fabs:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f16_e64 v0, |v0|, |v0|
+; GFX950-GISEL-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
+; GFX950-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_f16_fabs:
 ; GFX10-SDAG:       ; %bb.0:
@@ -3130,21 +3353,37 @@ define half @v_maximumnum_f16_fneg(half %x, half %y) {
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_maximumnum_f16_fneg:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f16_e64 v1, -v1, -v1
-; GFX9-SDAG-NEXT:    v_max_f16_e64 v0, -v0, -v0
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_maximumnum_f16_fneg:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f16_e64 v0, -v0, -v0
-; GFX9-GISEL-NEXT:    v_max_f16_e64 v1, -v1, -v1
-; GFX9-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_maximumnum_f16_fneg:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f16_e64 v1, -v1, -v1
+; GFX900-SDAG-NEXT:    v_max_f16_e64 v0, -v0, -v0
+; GFX900-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_maximumnum_f16_fneg:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f16_e64 v0, -v0, -v0
+; GFX900-GISEL-NEXT:    v_max_f16_e64 v1, -v1, -v1
+; GFX900-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_f16_fneg:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f16_e64 v1, -v1, -v1
+; GFX950-SDAG-NEXT:    v_max_f16_e64 v0, -v0, -v0
+; GFX950-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_maximumnum_f16_fneg:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f16_e64 v0, -v0, -v0
+; GFX950-GISEL-NEXT:    v_max_f16_e64 v1, -v1, -v1
+; GFX950-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_f16_fneg:
 ; GFX10-SDAG:       ; %bb.0:
@@ -3288,21 +3527,37 @@ define double @v_maximumnum_f64_fneg(double %x, double %y) {
 ; GFX8-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_maximumnum_f64_fneg:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f64 v[2:3], -v[2:3], -v[2:3]
-; GFX9-SDAG-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1]
-; GFX9-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_maximumnum_f64_fneg:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1]
-; GFX9-GISEL-NEXT:    v_max_f64 v[2:3], -v[2:3], -v[2:3]
-; GFX9-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_maximumnum_f64_fneg:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f64 v[2:3], -v[2:3], -v[2:3]
+; GFX900-SDAG-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1]
+; GFX900-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_maximumnum_f64_fneg:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1]
+; GFX900-GISEL-NEXT:    v_max_f64 v[2:3], -v[2:3], -v[2:3]
+; GFX900-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_f64_fneg:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], -v[2:3], -v[2:3]
+; GFX950-SDAG-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1]
+; GFX950-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_maximumnum_f64_fneg:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1]
+; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], -v[2:3], -v[2:3]
+; GFX950-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_f64_fneg:
 ; GFX10-SDAG:       ; %bb.0:
@@ -3564,11 +3819,17 @@ define <2 x half> @v_maximumnum_v2f16_nnan(<2 x half> %x, <2 x half> %y) {
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_v2f16_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_v2f16_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_v2f16_nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v2f16_nnan:
 ; GFX10:       ; %bb.0:
@@ -3663,16 +3924,16 @@ define <3 x half> @v_maximumnum_v3f16(<3 x half> %x, <3 x half> %y) {
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_maximumnum_v3f16:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX9-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX9-SDAG-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX9-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_pk_max_f16 v1, v1, v3
-; GFX9-SDAG-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_maximumnum_v3f16:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX900-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX900-SDAG-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX900-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX900-SDAG-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_maximumnum_v3f16:
 ; GFX900-GISEL:       ; %bb.0:
@@ -3685,6 +3946,17 @@ define <3 x half> @v_maximumnum_v3f16(<3 x half> %x, <3 x half> %y) {
 ; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v2
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX950-SDAG-LABEL: v_maximumnum_v3f16:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX950-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX950-SDAG-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX950-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX950-SDAG-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX950-GISEL-LABEL: v_maximumnum_v3f16:
 ; GFX950-GISEL:       ; %bb.0:
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3834,19 +4106,33 @@ define <3 x half> @v_maximumnum_v3f16_nnan(<3 x half> %x, <3 x half> %y) {
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_maximumnum_v3f16_nnan:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_pk_max_f16 v1, v1, v3
-; GFX9-SDAG-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_maximumnum_v3f16_nnan:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX900-SDAG-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_maximumnum_v3f16_nnan:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_v3f16_nnan:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX950-SDAG-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_maximumnum_v3f16_nnan:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v1, v1, v3
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX950-GISEL-LABEL: v_maximumnum_v3f16_nnan:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v3f16_nnan:
 ; GFX10:       ; %bb.0:
@@ -4157,12 +4443,19 @@ define <4 x half> @v_maximumnum_v4f16_nnan(<4 x half> %x, <4 x half> %y) {
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_v4f16_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX9-NEXT:    v_pk_max_f16 v1, v1, v3
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_v4f16_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX900-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_v4f16_nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX950-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v4f16_nnan:
 ; GFX10:       ; %bb.0:
@@ -6691,27 +6984,49 @@ define <2 x float> @v_maximumnum_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX8-GISEL-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_maximumnum_v2f32:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v1, v1, v2
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_maximumnum_v2f32:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v2
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_maximumnum_v2f32:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_maximumnum_v2f32:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_v2f32:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_maximumnum_v2f32:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_v2f32:
 ; GFX10-SDAG:       ; %bb.0:
@@ -6797,12 +7112,19 @@ define <2 x float> @v_maximumnum_v2f32_nnan(<2 x float> %x, <2 x float> %y) {
 ; GFX8-NEXT:    v_max_f32_e32 v1, v1, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_v2f32_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_v2f32_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX900-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_v2f32_nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX950-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v2f32_nnan:
 ; GFX10:       ; %bb.0:
@@ -6887,33 +7209,61 @@ define <3 x float> @v_maximumnum_v3f32(<3 x float> %x, <3 x float> %y) {
 ; GFX8-GISEL-NEXT:    v_max_f32_e32 v2, v2, v3
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_maximumnum_v3f32:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v3
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v3, v5, v5
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v2, v2, v3
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_maximumnum_v3f32:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v3
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v3, v5, v5
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v2, v3
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_maximumnum_v3f32:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v2, v2, v3
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_maximumnum_v3f32:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v2, v2, v3
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_v3f32:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v2, v2, v3
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_maximumnum_v3f32:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v2, v3
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_v3f32:
 ; GFX10-SDAG:       ; %bb.0:
@@ -7015,13 +7365,21 @@ define <3 x float> @v_maximumnum_v3f32_nnan(<3 x float> %x, <3 x float> %y) {
 ; GFX8-NEXT:    v_max_f32_e32 v2, v2, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_v3f32_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v3
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v4
-; GFX9-NEXT:    v_max_f32_e32 v2, v2, v5
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_v3f32_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX900-NEXT:    v_max_f32_e32 v1, v1, v4
+; GFX900-NEXT:    v_max_f32_e32 v2, v2, v5
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_v3f32_nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX950-NEXT:    v_max_f32_e32 v1, v1, v4
+; GFX950-NEXT:    v_max_f32_e32 v2, v2, v5
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v3f32_nnan:
 ; GFX10:       ; %bb.0:
@@ -7121,39 +7479,73 @@ define <4 x float> @v_maximumnum_v4f32(<4 x float> %x, <4 x float> %y) {
 ; GFX8-GISEL-NEXT:    v_max_f32_e32 v3, v3, v4
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_maximumnum_v4f32:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v4
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v1, v1, v4
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v4, v6, v6
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v4, v7, v7
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v3, v3, v4
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_maximumnum_v4f32:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v4
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v4
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v4, v6, v6
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v4, v7, v7
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v3, v3, v4
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_maximumnum_v4f32:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v4
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v1, v1, v4
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v4, v6, v6
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v2, v2, v4
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v4, v7, v7
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v3, v3, v4
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_maximumnum_v4f32:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v4
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v4
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v4, v6, v6
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v2, v2, v4
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v4, v7, v7
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v3, v3, v4
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_v4f32:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v4
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v1, v1, v4
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v4, v6, v6
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v2, v2, v4
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v4, v7, v7
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v3, v3, v4
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_maximumnum_v4f32:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v4
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v4
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v4, v6, v6
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v2, v4
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v4, v7, v7
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v3, v3, v4
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_v4f32:
 ; GFX10-SDAG:       ; %bb.0:
@@ -7267,14 +7659,23 @@ define <4 x float> @v_maximumnum_v4f32_nnan(<4 x float> %x, <4 x float> %y) {
 ; GFX8-NEXT:    v_max_f32_e32 v3, v3, v7
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_v4f32_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v4
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v5
-; GFX9-NEXT:    v_max_f32_e32 v2, v2, v6
-; GFX9-NEXT:    v_max_f32_e32 v3, v3, v7
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_v4f32_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v4
+; GFX900-NEXT:    v_max_f32_e32 v1, v1, v5
+; GFX900-NEXT:    v_max_f32_e32 v2, v2, v6
+; GFX900-NEXT:    v_max_f32_e32 v3, v3, v7
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_v4f32_nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f32_e32 v0, v0, v4
+; GFX950-NEXT:    v_max_f32_e32 v1, v1, v5
+; GFX950-NEXT:    v_max_f32_e32 v2, v2, v6
+; GFX950-NEXT:    v_max_f32_e32 v3, v3, v7
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v4f32_nnan:
 ; GFX10:       ; %bb.0:
@@ -7376,12 +7777,12 @@ define <2 x double> @v_maximumnum_v2f64(<2 x double> %x, <2 x double> %y) {
 ; GFX950-SDAG-LABEL: v_maximumnum_v2f64:
 ; GFX950-SDAG:       ; %bb.0:
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX950-SDAG-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX950-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
 ; GFX950-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX950-SDAG-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-GISEL-LABEL: v_maximumnum_v2f64:
@@ -7491,12 +7892,26 @@ define <2 x double> @v_maximumnum_v2f64_nnan(<2 x double> %x, <2 x double> %y) {
 ; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_v2f64_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_v2f64_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX900-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_v2f64_nnan:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX950-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_maximumnum_v2f64_nnan:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v2f64_nnan:
 ; GFX10:       ; %bb.0:
@@ -7614,15 +8029,15 @@ define <3 x double> @v_maximumnum_v3f64(<3 x double> %x, <3 x double> %y) {
 ; GFX950-SDAG-LABEL: v_maximumnum_v3f64:
 ; GFX950-SDAG:       ; %bb.0:
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX950-SDAG-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX950-SDAG-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
+; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX950-SDAG-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
 ; GFX950-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX950-SDAG-NEXT:    v_max_f64 v[4:5], v[4:5], v[10:11]
+; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[8:9]
 ; GFX950-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[6:7]
-; GFX950-SDAG-NEXT:    v_max_f64 v[6:7], v[8:9], v[8:9]
-; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX950-SDAG-NEXT:    v_max_f64 v[6:7], v[10:11], v[10:11]
-; GFX950-SDAG-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
-; GFX950-SDAG-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-GISEL-LABEL: v_maximumnum_v3f64:
@@ -7755,13 +8170,29 @@ define <3 x double> @v_maximumnum_v3f64_nnan(<3 x double> %x, <3 x double> %y) {
 ; GFX8-NEXT:    v_max_f64 v[4:5], v[4:5], v[10:11]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_v3f64_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[6:7]
-; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], v[8:9]
-; GFX9-NEXT:    v_max_f64 v[4:5], v[4:5], v[10:11]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_v3f64_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f64 v[0:1], v[0:1], v[6:7]
+; GFX900-NEXT:    v_max_f64 v[2:3], v[2:3], v[8:9]
+; GFX900-NEXT:    v_max_f64 v[4:5], v[4:5], v[10:11]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_v3f64_nnan:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f64 v[4:5], v[4:5], v[10:11]
+; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[8:9]
+; GFX950-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[6:7]
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_maximumnum_v3f64_nnan:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[6:7]
+; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[8:9]
+; GFX950-GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[10:11]
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v3f64_nnan:
 ; GFX10:       ; %bb.0:
@@ -7900,18 +8331,18 @@ define <4 x double> @v_maximumnum_v4f64(<4 x double> %x, <4 x double> %y) {
 ; GFX950-SDAG-LABEL: v_maximumnum_v4f64:
 ; GFX950-SDAG:       ; %bb.0:
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f64 v[14:15], v[14:15], v[14:15]
+; GFX950-SDAG-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX950-SDAG-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX950-SDAG-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX950-SDAG-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX950-SDAG-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
 ; GFX950-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX950-SDAG-NEXT:    v_max_f64 v[6:7], v[6:7], v[14:15]
+; GFX950-SDAG-NEXT:    v_max_f64 v[4:5], v[4:5], v[12:13]
+; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[10:11]
 ; GFX950-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[8:9]
-; GFX950-SDAG-NEXT:    v_max_f64 v[8:9], v[10:11], v[10:11]
-; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[8:9]
-; GFX950-SDAG-NEXT:    v_max_f64 v[8:9], v[12:13], v[12:13]
-; GFX950-SDAG-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
-; GFX950-SDAG-NEXT:    v_max_f64 v[4:5], v[4:5], v[8:9]
-; GFX950-SDAG-NEXT:    v_max_f64 v[8:9], v[14:15], v[14:15]
-; GFX950-SDAG-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
-; GFX950-SDAG-NEXT:    v_max_f64 v[6:7], v[6:7], v[8:9]
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-GISEL-LABEL: v_maximumnum_v4f64:
@@ -8067,14 +8498,32 @@ define <4 x double> @v_maximumnum_v4f64_nnan(<4 x double> %x, <4 x double> %y) {
 ; GFX8-NEXT:    v_max_f64 v[6:7], v[6:7], v[14:15]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_v4f64_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[8:9]
-; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], v[10:11]
-; GFX9-NEXT:    v_max_f64 v[4:5], v[4:5], v[12:13]
-; GFX9-NEXT:    v_max_f64 v[6:7], v[6:7], v[14:15]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_v4f64_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f64 v[0:1], v[0:1], v[8:9]
+; GFX900-NEXT:    v_max_f64 v[2:3], v[2:3], v[10:11]
+; GFX900-NEXT:    v_max_f64 v[4:5], v[4:5], v[12:13]
+; GFX900-NEXT:    v_max_f64 v[6:7], v[6:7], v[14:15]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_v4f64_nnan:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f64 v[6:7], v[6:7], v[14:15]
+; GFX950-SDAG-NEXT:    v_max_f64 v[4:5], v[4:5], v[12:13]
+; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[10:11]
+; GFX950-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[8:9]
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_maximumnum_v4f64_nnan:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[8:9]
+; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[10:11]
+; GFX950-GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[12:13]
+; GFX950-GISEL-NEXT:    v_max_f64 v[6:7], v[6:7], v[14:15]
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v4f64_nnan:
 ; GFX10:       ; %bb.0:
@@ -8136,11 +8585,17 @@ define half @v_maximumnum_f16_no_ieee(half %x, half %y) #0 {
 ; GFX8-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_f16_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_f16_no_ieee:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_f16_no_ieee:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_f16_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8241,11 +8696,17 @@ define half @v_maximumnum_f16_nan_no_ieee(half %x, half %y) #0 {
 ; GFX8-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_f16_nan_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_f16_nan_no_ieee:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_f16_nan_no_ieee:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_f16_nan_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8301,11 +8762,17 @@ define float @v_maximumnum_f32_no_ieee(float %x, float %y) #0 {
 ; GFX8-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_f32_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_f32_no_ieee:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_f32_no_ieee:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_f32_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8359,11 +8826,17 @@ define float @v_maximumnum_f32_nnan_no_ieee(float %x, float %y) #0 {
 ; GFX8-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_f32_nnan_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_f32_nnan_no_ieee:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_f32_nnan_no_ieee:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_f32_nnan_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8403,11 +8876,17 @@ define double @v_maximumnum_f64_no_ieee(double %x, double %y) #0 {
 ; GFX8-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_f64_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_f64_no_ieee:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_f64_no_ieee:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_f64_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8463,11 +8942,17 @@ define double @v_maximumnum_f64_nnan_no_ieee(double %x, double %y) #0 {
 ; GFX8-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_f64_nnan_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_f64_nnan_no_ieee:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_f64_nnan_no_ieee:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_f64_nnan_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8539,11 +9024,17 @@ define <2 x half> @v_maximumnum_v2f16_no_ieee(<2 x half> %x, <2 x half> %y) #0 {
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_v2f16_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_v2f16_no_ieee:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_v2f16_no_ieee:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v2f16_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8631,11 +9122,17 @@ define <2 x half> @v_maximumnum_v2f16_nnan_no_ieee(<2 x half> %x, <2 x half> %y)
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_v2f16_nnan_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_v2f16_nnan_no_ieee:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_v2f16_nnan_no_ieee:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v2f16_nnan_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8718,19 +9215,33 @@ define <3 x half> @v_maximumnum_v3f16_nnan_no_ieee(<3 x half> %x, <3 x half> %y)
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_maximumnum_v3f16_nnan_no_ieee:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_pk_max_f16 v1, v1, v3
-; GFX9-SDAG-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_maximumnum_v3f16_nnan_no_ieee:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX900-SDAG-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_maximumnum_v3f16_nnan_no_ieee:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_v3f16_nnan_no_ieee:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX950-SDAG-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_maximumnum_v3f16_nnan_no_ieee:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v1, v1, v3
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX950-GISEL-LABEL: v_maximumnum_v3f16_nnan_no_ieee:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v3f16_nnan_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8829,12 +9340,19 @@ define <4 x half> @v_maximumnum_v4f16_nnan_no_ieee(<4 x half> %x, <4 x half> %y)
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_v4f16_nnan_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX9-NEXT:    v_pk_max_f16 v1, v1, v3
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_v4f16_nnan_no_ieee:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX900-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_v4f16_nnan_no_ieee:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX950-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v4f16_nnan_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8865,6 +9383,3 @@ define <4 x half> @v_maximumnum_v4f16_nnan_no_ieee(<4 x half> %x, <4 x half> %y)
 }
 
 attributes #0 = { "amdgpu-ieee"="false" }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX900: {{.*}}
-; GFX950: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.ll
index 558006d2b6957..9c4a1ca797110 100644
--- a/llvm/test/CodeGen/AMDGPU/minimumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/minimumnum.ll
@@ -5,11 +5,11 @@
 ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8,GFX8-SDAG %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s
 
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900,GFX9-SDAG,GFX900-SDAG %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900,GFX9-GISEL,GFX900-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX900,GFX900-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX900,GFX900-GISEL %s
 
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX9,GFX950,GFX9-SDAG,GFX950-SDAG %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX9,GFX950,GFX9-GISEL,GFX950-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-GISEL %s
 
 ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
@@ -62,21 +62,37 @@ define half @v_minimumnum_f16(half %x, half %y) {
 ; GFX8-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_minimumnum_f16:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_minimumnum_f16:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_minimumnum_f16:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX900-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_minimumnum_f16:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_f16:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX950-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_minimumnum_f16:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_f16:
 ; GFX10-SDAG:       ; %bb.0:
@@ -211,11 +227,17 @@ define half @v_minimumnum_f16_nnan(half %x, half %y) {
 ; GFX8-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_f16_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_f16_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_f16_nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_f16_nnan:
 ; GFX10:       ; %bb.0:
@@ -283,12 +305,19 @@ define half @v_minimumnum_f16_1.0(half %x) {
 ; GFX8-NEXT:    v_min_f16_e32 v0, 1.0, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_f16_1.0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX9-NEXT:    v_min_f16_e32 v0, 1.0, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_f16_1.0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX900-NEXT:    v_min_f16_e32 v0, 1.0, v0
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_f16_1.0:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX950-NEXT:    v_min_f16_e32 v0, 1.0, v0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_f16_1.0:
 ; GFX10:       ; %bb.0:
@@ -373,21 +402,37 @@ define float @v_minimumnum_f32(float %x, float %y) {
 ; GFX8-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_minimumnum_f32:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_minimumnum_f32:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_minimumnum_f32:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_minimumnum_f32:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_f32:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_minimumnum_f32:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_f32:
 ; GFX10-SDAG:       ; %bb.0:
@@ -461,11 +506,17 @@ define float @v_minimumnum_f32_nnan(float %x, float %y) {
 ; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_f32_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_f32_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_f32_nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_f32_nnan:
 ; GFX10:       ; %bb.0:
@@ -525,21 +576,37 @@ define double @v_minimumnum_f64(double %x, double %y) {
 ; GFX8-GISEL-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_minimumnum_f64:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX9-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX9-SDAG-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_minimumnum_f64:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX9-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX9-GISEL-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_minimumnum_f64:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX900-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX900-SDAG-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_minimumnum_f64:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX900-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX900-GISEL-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_f64:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX950-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX950-SDAG-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_minimumnum_f64:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX950-GISEL-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_f64:
 ; GFX10-SDAG:       ; %bb.0:
@@ -617,11 +684,17 @@ define double @v_minimumnum_f64_nnan(double %x, double %y) {
 ; GFX8-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_f64_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_f64_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_f64_nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_f64_nnan:
 ; GFX10:       ; %bb.0:
@@ -663,12 +736,19 @@ define float @v_minimumnum_f32_1.0(float %x) {
 ; GFX8-NEXT:    v_min_f32_e32 v0, 1.0, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_f32_1.0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-NEXT:    v_min_f32_e32 v0, 1.0, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_f32_1.0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-NEXT:    v_min_f32_e32 v0, 1.0, v0
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_f32_1.0:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-NEXT:    v_min_f32_e32 v0, 1.0, v0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_f32_1.0:
 ; GFX10:       ; %bb.0:
@@ -717,13 +797,21 @@ define float @v_minimumnum_f32_rhs_not_snan(float %x, float %y) {
 ; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_f32_rhs_not_snan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_f32_rhs_not_snan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_f32_rhs_not_snan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_f32_rhs_not_snan:
 ; GFX10:       ; %bb.0:
@@ -774,13 +862,21 @@ define float @v_minimumnum_f32_lhs_not_snan(float %x, float %y) {
 ; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_f32_lhs_not_snan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_f32_lhs_not_snan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_f32_lhs_not_snan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_f32_lhs_not_snan:
 ; GFX10:       ; %bb.0:
@@ -831,13 +927,21 @@ define float @v_minimumnum_f32_both_operands_not_snan(float %x, float %y) {
 ; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_f32_both_operands_not_snan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_f32_both_operands_not_snan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_f32_both_operands_not_snan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_f32_both_operands_not_snan:
 ; GFX10:       ; %bb.0:
@@ -887,12 +991,19 @@ define double @v_minimumnum_f64_1.0(double %x) {
 ; GFX8-NEXT:    v_min_f64 v[0:1], v[0:1], 1.0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_f64_1.0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], 1.0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_f64_1.0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX900-NEXT:    v_min_f64 v[0:1], v[0:1], 1.0
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_f64_1.0:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX950-NEXT:    v_min_f64 v[0:1], v[0:1], 1.0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_f64_1.0:
 ; GFX10:       ; %bb.0:
@@ -2015,21 +2126,37 @@ define float @v_minimumnum_f32_fabs_rhs(float %x, float %y) {
 ; GFX8-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_minimumnum_f32_fabs_rhs:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_minimumnum_f32_fabs_rhs:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
-; GFX9-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_minimumnum_f32_fabs_rhs:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_minimumnum_f32_fabs_rhs:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
+; GFX900-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_f32_fabs_rhs:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_minimumnum_f32_fabs_rhs:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
+; GFX950-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_f32_fabs_rhs:
 ; GFX10-SDAG:       ; %bb.0:
@@ -2128,21 +2255,37 @@ define float @v_minimumnum_f32_fneg_fabs_rhs(float %x, float %y) {
 ; GFX8-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_minimumnum_f32_fneg_fabs_rhs:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1|
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_minimumnum_f32_fneg_fabs_rhs:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1|
-; GFX9-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_minimumnum_f32_fneg_fabs_rhs:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1|
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_minimumnum_f32_fneg_fabs_rhs:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1|
+; GFX900-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_f32_fneg_fabs_rhs:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1|
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_minimumnum_f32_fneg_fabs_rhs:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1|
+; GFX950-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_f32_fneg_fabs_rhs:
 ; GFX10-SDAG:       ; %bb.0:
@@ -2242,21 +2385,37 @@ define float @v_minimumnum_f32_fabs(float %x, float %y) {
 ; GFX8-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_minimumnum_f32_fabs:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
-; GFX9-SDAG-NEXT:    v_max_f32_e64 v0, |v0|, |v0|
-; GFX9-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_minimumnum_f32_fabs:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f32_e64 v0, |v0|, |v0|
-; GFX9-GISEL-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
-; GFX9-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_minimumnum_f32_fabs:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
+; GFX900-SDAG-NEXT:    v_max_f32_e64 v0, |v0|, |v0|
+; GFX900-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_minimumnum_f32_fabs:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f32_e64 v0, |v0|, |v0|
+; GFX900-GISEL-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
+; GFX900-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_f32_fabs:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
+; GFX950-SDAG-NEXT:    v_max_f32_e64 v0, |v0|, |v0|
+; GFX950-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_minimumnum_f32_fabs:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f32_e64 v0, |v0|, |v0|
+; GFX950-GISEL-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
+; GFX950-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_f32_fabs:
 ; GFX10-SDAG:       ; %bb.0:
@@ -2356,21 +2515,37 @@ define float @v_minimumnum_f32_fneg(float %x, float %y) {
 ; GFX8-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_minimumnum_f32_fneg:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f32_e64 v1, -v1, -v1
-; GFX9-SDAG-NEXT:    v_max_f32_e64 v0, -v0, -v0
-; GFX9-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_minimumnum_f32_fneg:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
-; GFX9-GISEL-NEXT:    v_max_f32_e64 v1, -v1, -v1
-; GFX9-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_minimumnum_f32_fneg:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f32_e64 v1, -v1, -v1
+; GFX900-SDAG-NEXT:    v_max_f32_e64 v0, -v0, -v0
+; GFX900-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_minimumnum_f32_fneg:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
+; GFX900-GISEL-NEXT:    v_max_f32_e64 v1, -v1, -v1
+; GFX900-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_f32_fneg:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f32_e64 v1, -v1, -v1
+; GFX950-SDAG-NEXT:    v_max_f32_e64 v0, -v0, -v0
+; GFX950-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_minimumnum_f32_fneg:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
+; GFX950-GISEL-NEXT:    v_max_f32_e64 v1, -v1, -v1
+; GFX950-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_f32_fneg:
 ; GFX10-SDAG:       ; %bb.0:
@@ -2473,21 +2648,37 @@ define half @v_minimumnum_f16_fabs_rhs(half %x, half %y) {
 ; GFX8-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_minimumnum_f16_fabs_rhs:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_minimumnum_f16_fabs_rhs:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
-; GFX9-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_minimumnum_f16_fabs_rhs:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
+; GFX900-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_minimumnum_f16_fabs_rhs:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
+; GFX900-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_f16_fabs_rhs:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
+; GFX950-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_minimumnum_f16_fabs_rhs:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
+; GFX950-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_f16_fabs_rhs:
 ; GFX10-SDAG:       ; %bb.0:
@@ -2633,21 +2824,37 @@ define half @v_minimumnum_f16_fneg_fabs_rhs(half %x, half %y) {
 ; GFX8-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_minimumnum_f16_fneg_fabs_rhs:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_minimumnum_f16_fneg_fabs_rhs:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
-; GFX9-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_minimumnum_f16_fneg_fabs_rhs:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
+; GFX900-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_minimumnum_f16_fneg_fabs_rhs:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
+; GFX900-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_f16_fneg_fabs_rhs:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
+; GFX950-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_minimumnum_f16_fneg_fabs_rhs:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
+; GFX950-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_f16_fneg_fabs_rhs:
 ; GFX10-SDAG:       ; %bb.0:
@@ -2794,21 +3001,37 @@ define half @v_minimumnum_f16_fabs(half %x, half %y) {
 ; GFX8-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_minimumnum_f16_fabs:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
-; GFX9-SDAG-NEXT:    v_max_f16_e64 v0, |v0|, |v0|
-; GFX9-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_minimumnum_f16_fabs:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f16_e64 v0, |v0|, |v0|
-; GFX9-GISEL-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
-; GFX9-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_minimumnum_f16_fabs:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
+; GFX900-SDAG-NEXT:    v_max_f16_e64 v0, |v0|, |v0|
+; GFX900-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_minimumnum_f16_fabs:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f16_e64 v0, |v0|, |v0|
+; GFX900-GISEL-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
+; GFX900-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_f16_fabs:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
+; GFX950-SDAG-NEXT:    v_max_f16_e64 v0, |v0|, |v0|
+; GFX950-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_minimumnum_f16_fabs:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f16_e64 v0, |v0|, |v0|
+; GFX950-GISEL-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
+; GFX950-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_f16_fabs:
 ; GFX10-SDAG:       ; %bb.0:
@@ -2955,21 +3178,37 @@ define half @v_minimumnum_f16_fneg(half %x, half %y) {
 ; GFX8-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_minimumnum_f16_fneg:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f16_e64 v1, -v1, -v1
-; GFX9-SDAG-NEXT:    v_max_f16_e64 v0, -v0, -v0
-; GFX9-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_minimumnum_f16_fneg:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f16_e64 v0, -v0, -v0
-; GFX9-GISEL-NEXT:    v_max_f16_e64 v1, -v1, -v1
-; GFX9-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_minimumnum_f16_fneg:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f16_e64 v1, -v1, -v1
+; GFX900-SDAG-NEXT:    v_max_f16_e64 v0, -v0, -v0
+; GFX900-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_minimumnum_f16_fneg:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f16_e64 v0, -v0, -v0
+; GFX900-GISEL-NEXT:    v_max_f16_e64 v1, -v1, -v1
+; GFX900-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_f16_fneg:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f16_e64 v1, -v1, -v1
+; GFX950-SDAG-NEXT:    v_max_f16_e64 v0, -v0, -v0
+; GFX950-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_minimumnum_f16_fneg:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f16_e64 v0, -v0, -v0
+; GFX950-GISEL-NEXT:    v_max_f16_e64 v1, -v1, -v1
+; GFX950-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_f16_fneg:
 ; GFX10-SDAG:       ; %bb.0:
@@ -3113,21 +3352,37 @@ define double @v_minimumnum_f64_fneg(double %x, double %y) {
 ; GFX8-GISEL-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_minimumnum_f64_fneg:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f64 v[2:3], -v[2:3], -v[2:3]
-; GFX9-SDAG-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1]
-; GFX9-SDAG-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_minimumnum_f64_fneg:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1]
-; GFX9-GISEL-NEXT:    v_max_f64 v[2:3], -v[2:3], -v[2:3]
-; GFX9-GISEL-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_minimumnum_f64_fneg:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f64 v[2:3], -v[2:3], -v[2:3]
+; GFX900-SDAG-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1]
+; GFX900-SDAG-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_minimumnum_f64_fneg:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1]
+; GFX900-GISEL-NEXT:    v_max_f64 v[2:3], -v[2:3], -v[2:3]
+; GFX900-GISEL-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_f64_fneg:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], -v[2:3], -v[2:3]
+; GFX950-SDAG-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1]
+; GFX950-SDAG-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_minimumnum_f64_fneg:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1]
+; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], -v[2:3], -v[2:3]
+; GFX950-GISEL-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_f64_fneg:
 ; GFX10-SDAG:       ; %bb.0:
@@ -3389,11 +3644,17 @@ define <2 x half> @v_minimumnum_v2f16_nnan(<2 x half> %x, <2 x half> %y) {
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_v2f16_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_v2f16_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_v2f16_nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v2f16_nnan:
 ; GFX10:       ; %bb.0:
@@ -3488,16 +3749,16 @@ define <3 x half> @v_minimumnum_v3f16(<3 x half> %x, <3 x half> %y) {
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_minimumnum_v3f16:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX9-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX9-SDAG-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX9-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_pk_min_f16 v1, v1, v3
-; GFX9-SDAG-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_minimumnum_v3f16:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX900-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX900-SDAG-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX900-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX900-SDAG-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_minimumnum_v3f16:
 ; GFX900-GISEL:       ; %bb.0:
@@ -3510,6 +3771,17 @@ define <3 x half> @v_minimumnum_v3f16(<3 x half> %x, <3 x half> %y) {
 ; GFX900-GISEL-NEXT:    v_pk_min_f16 v1, v1, v2
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX950-SDAG-LABEL: v_minimumnum_v3f16:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX950-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX950-SDAG-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX950-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX950-SDAG-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX950-GISEL-LABEL: v_minimumnum_v3f16:
 ; GFX950-GISEL:       ; %bb.0:
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3659,19 +3931,33 @@ define <3 x half> @v_minimumnum_v3f16_nnan(<3 x half> %x, <3 x half> %y) {
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_minimumnum_v3f16_nnan:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_pk_min_f16 v1, v1, v3
-; GFX9-SDAG-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_minimumnum_v3f16_nnan:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX900-SDAG-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_minimumnum_v3f16_nnan:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_v3f16_nnan:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX950-SDAG-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_minimumnum_v3f16_nnan:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v1, v1, v3
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX950-GISEL-LABEL: v_minimumnum_v3f16_nnan:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v3f16_nnan:
 ; GFX10:       ; %bb.0:
@@ -3982,12 +4268,19 @@ define <4 x half> @v_minimumnum_v4f16_nnan(<4 x half> %x, <4 x half> %y) {
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_v4f16_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX9-NEXT:    v_pk_min_f16 v1, v1, v3
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_v4f16_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX900-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_v4f16_nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX950-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v4f16_nnan:
 ; GFX10:       ; %bb.0:
@@ -6516,27 +6809,49 @@ define <2 x float> @v_minimumnum_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX8-GISEL-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_minimumnum_v2f32:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-SDAG-NEXT:    v_min_f32_e32 v1, v1, v2
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_minimumnum_v2f32:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX9-GISEL-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX9-GISEL-NEXT:    v_min_f32_e32 v1, v1, v2
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_minimumnum_v2f32:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-SDAG-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_minimumnum_v2f32:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX900-GISEL-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX900-GISEL-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_v2f32:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-SDAG-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_minimumnum_v2f32:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX950-GISEL-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX950-GISEL-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_v2f32:
 ; GFX10-SDAG:       ; %bb.0:
@@ -6622,12 +6937,19 @@ define <2 x float> @v_minimumnum_v2f32_nnan(<2 x float> %x, <2 x float> %y) {
 ; GFX8-NEXT:    v_min_f32_e32 v1, v1, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_v2f32_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_v2f32_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX900-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_v2f32_nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX950-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v2f32_nnan:
 ; GFX10:       ; %bb.0:
@@ -6712,33 +7034,61 @@ define <3 x float> @v_minimumnum_v3f32(<3 x float> %x, <3 x float> %y) {
 ; GFX8-GISEL-NEXT:    v_min_f32_e32 v2, v2, v3
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_minimumnum_v3f32:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_min_f32_e32 v0, v0, v3
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-SDAG-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v3, v5, v5
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX9-SDAG-NEXT:    v_min_f32_e32 v2, v2, v3
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_minimumnum_v3f32:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX9-GISEL-NEXT:    v_min_f32_e32 v0, v0, v3
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX9-GISEL-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v3, v5, v5
-; GFX9-GISEL-NEXT:    v_min_f32_e32 v2, v2, v3
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_minimumnum_v3f32:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-SDAG-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX900-SDAG-NEXT:    v_min_f32_e32 v2, v2, v3
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_minimumnum_v3f32:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX900-GISEL-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX900-GISEL-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX900-GISEL-NEXT:    v_min_f32_e32 v2, v2, v3
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_v3f32:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-SDAG-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX950-SDAG-NEXT:    v_min_f32_e32 v2, v2, v3
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_minimumnum_v3f32:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX950-GISEL-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX950-GISEL-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX950-GISEL-NEXT:    v_min_f32_e32 v2, v2, v3
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_v3f32:
 ; GFX10-SDAG:       ; %bb.0:
@@ -6840,13 +7190,21 @@ define <3 x float> @v_minimumnum_v3f32_nnan(<3 x float> %x, <3 x float> %y) {
 ; GFX8-NEXT:    v_min_f32_e32 v2, v2, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_v3f32_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v0, v0, v3
-; GFX9-NEXT:    v_min_f32_e32 v1, v1, v4
-; GFX9-NEXT:    v_min_f32_e32 v2, v2, v5
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_v3f32_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX900-NEXT:    v_min_f32_e32 v1, v1, v4
+; GFX900-NEXT:    v_min_f32_e32 v2, v2, v5
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_v3f32_nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX950-NEXT:    v_min_f32_e32 v1, v1, v4
+; GFX950-NEXT:    v_min_f32_e32 v2, v2, v5
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v3f32_nnan:
 ; GFX10:       ; %bb.0:
@@ -6946,39 +7304,73 @@ define <4 x float> @v_minimumnum_v4f32(<4 x float> %x, <4 x float> %y) {
 ; GFX8-GISEL-NEXT:    v_min_f32_e32 v3, v3, v4
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_minimumnum_v4f32:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_min_f32_e32 v0, v0, v4
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-SDAG-NEXT:    v_min_f32_e32 v1, v1, v4
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v4, v6, v6
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX9-SDAG-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v4, v7, v7
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX9-SDAG-NEXT:    v_min_f32_e32 v3, v3, v4
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_minimumnum_v4f32:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX9-GISEL-NEXT:    v_min_f32_e32 v0, v0, v4
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX9-GISEL-NEXT:    v_min_f32_e32 v1, v1, v4
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v4, v6, v6
-; GFX9-GISEL-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v4, v7, v7
-; GFX9-GISEL-NEXT:    v_min_f32_e32 v3, v3, v4
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_minimumnum_v4f32:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_min_f32_e32 v0, v0, v4
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-SDAG-NEXT:    v_min_f32_e32 v1, v1, v4
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v4, v6, v6
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX900-SDAG-NEXT:    v_min_f32_e32 v2, v2, v4
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v4, v7, v7
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX900-SDAG-NEXT:    v_min_f32_e32 v3, v3, v4
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_minimumnum_v4f32:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX900-GISEL-NEXT:    v_min_f32_e32 v0, v0, v4
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX900-GISEL-NEXT:    v_min_f32_e32 v1, v1, v4
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v4, v6, v6
+; GFX900-GISEL-NEXT:    v_min_f32_e32 v2, v2, v4
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v4, v7, v7
+; GFX900-GISEL-NEXT:    v_min_f32_e32 v3, v3, v4
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_v4f32:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_min_f32_e32 v0, v0, v4
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-SDAG-NEXT:    v_min_f32_e32 v1, v1, v4
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v4, v6, v6
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX950-SDAG-NEXT:    v_min_f32_e32 v2, v2, v4
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v4, v7, v7
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX950-SDAG-NEXT:    v_min_f32_e32 v3, v3, v4
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_minimumnum_v4f32:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX950-GISEL-NEXT:    v_min_f32_e32 v0, v0, v4
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX950-GISEL-NEXT:    v_min_f32_e32 v1, v1, v4
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v4, v6, v6
+; GFX950-GISEL-NEXT:    v_min_f32_e32 v2, v2, v4
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v4, v7, v7
+; GFX950-GISEL-NEXT:    v_min_f32_e32 v3, v3, v4
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_v4f32:
 ; GFX10-SDAG:       ; %bb.0:
@@ -7092,14 +7484,23 @@ define <4 x float> @v_minimumnum_v4f32_nnan(<4 x float> %x, <4 x float> %y) {
 ; GFX8-NEXT:    v_min_f32_e32 v3, v3, v7
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_v4f32_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v0, v0, v4
-; GFX9-NEXT:    v_min_f32_e32 v1, v1, v5
-; GFX9-NEXT:    v_min_f32_e32 v2, v2, v6
-; GFX9-NEXT:    v_min_f32_e32 v3, v3, v7
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_v4f32_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f32_e32 v0, v0, v4
+; GFX900-NEXT:    v_min_f32_e32 v1, v1, v5
+; GFX900-NEXT:    v_min_f32_e32 v2, v2, v6
+; GFX900-NEXT:    v_min_f32_e32 v3, v3, v7
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_v4f32_nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f32_e32 v0, v0, v4
+; GFX950-NEXT:    v_min_f32_e32 v1, v1, v5
+; GFX950-NEXT:    v_min_f32_e32 v2, v2, v6
+; GFX950-NEXT:    v_min_f32_e32 v3, v3, v7
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v4f32_nnan:
 ; GFX10:       ; %bb.0:
@@ -7201,12 +7602,12 @@ define <2 x double> @v_minimumnum_v2f64(<2 x double> %x, <2 x double> %y) {
 ; GFX950-SDAG-LABEL: v_minimumnum_v2f64:
 ; GFX950-SDAG:       ; %bb.0:
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX950-SDAG-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX950-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX950-SDAG-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
 ; GFX950-SDAG-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX950-SDAG-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX950-SDAG-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-GISEL-LABEL: v_minimumnum_v2f64:
@@ -7316,12 +7717,26 @@ define <2 x double> @v_minimumnum_v2f64_nnan(<2 x double> %x, <2 x double> %y) {
 ; GFX8-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_v2f64_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX9-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_v2f64_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX900-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_v2f64_nnan:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX950-SDAG-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_minimumnum_v2f64_nnan:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX950-GISEL-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v2f64_nnan:
 ; GFX10:       ; %bb.0:
@@ -7439,15 +7854,15 @@ define <3 x double> @v_minimumnum_v3f64(<3 x double> %x, <3 x double> %y) {
 ; GFX950-SDAG-LABEL: v_minimumnum_v3f64:
 ; GFX950-SDAG:       ; %bb.0:
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX950-SDAG-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX950-SDAG-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
+; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX950-SDAG-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
 ; GFX950-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX950-SDAG-NEXT:    v_min_f64 v[4:5], v[4:5], v[10:11]
+; GFX950-SDAG-NEXT:    v_min_f64 v[2:3], v[2:3], v[8:9]
 ; GFX950-SDAG-NEXT:    v_min_f64 v[0:1], v[0:1], v[6:7]
-; GFX950-SDAG-NEXT:    v_max_f64 v[6:7], v[8:9], v[8:9]
-; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX950-SDAG-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX950-SDAG-NEXT:    v_max_f64 v[6:7], v[10:11], v[10:11]
-; GFX950-SDAG-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
-; GFX950-SDAG-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-GISEL-LABEL: v_minimumnum_v3f64:
@@ -7580,13 +7995,29 @@ define <3 x double> @v_minimumnum_v3f64_nnan(<3 x double> %x, <3 x double> %y) {
 ; GFX8-NEXT:    v_min_f64 v[4:5], v[4:5], v[10:11]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_v3f64_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[6:7]
-; GFX9-NEXT:    v_min_f64 v[2:3], v[2:3], v[8:9]
-; GFX9-NEXT:    v_min_f64 v[4:5], v[4:5], v[10:11]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_v3f64_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f64 v[0:1], v[0:1], v[6:7]
+; GFX900-NEXT:    v_min_f64 v[2:3], v[2:3], v[8:9]
+; GFX900-NEXT:    v_min_f64 v[4:5], v[4:5], v[10:11]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_v3f64_nnan:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_min_f64 v[4:5], v[4:5], v[10:11]
+; GFX950-SDAG-NEXT:    v_min_f64 v[2:3], v[2:3], v[8:9]
+; GFX950-SDAG-NEXT:    v_min_f64 v[0:1], v[0:1], v[6:7]
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_minimumnum_v3f64_nnan:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_min_f64 v[0:1], v[0:1], v[6:7]
+; GFX950-GISEL-NEXT:    v_min_f64 v[2:3], v[2:3], v[8:9]
+; GFX950-GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[10:11]
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v3f64_nnan:
 ; GFX10:       ; %bb.0:
@@ -7725,18 +8156,18 @@ define <4 x double> @v_minimumnum_v4f64(<4 x double> %x, <4 x double> %y) {
 ; GFX950-SDAG-LABEL: v_minimumnum_v4f64:
 ; GFX950-SDAG:       ; %bb.0:
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f64 v[14:15], v[14:15], v[14:15]
+; GFX950-SDAG-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX950-SDAG-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX950-SDAG-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX950-SDAG-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX950-SDAG-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
 ; GFX950-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX950-SDAG-NEXT:    v_min_f64 v[6:7], v[6:7], v[14:15]
+; GFX950-SDAG-NEXT:    v_min_f64 v[4:5], v[4:5], v[12:13]
+; GFX950-SDAG-NEXT:    v_min_f64 v[2:3], v[2:3], v[10:11]
 ; GFX950-SDAG-NEXT:    v_min_f64 v[0:1], v[0:1], v[8:9]
-; GFX950-SDAG-NEXT:    v_max_f64 v[8:9], v[10:11], v[10:11]
-; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX950-SDAG-NEXT:    v_min_f64 v[2:3], v[2:3], v[8:9]
-; GFX950-SDAG-NEXT:    v_max_f64 v[8:9], v[12:13], v[12:13]
-; GFX950-SDAG-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
-; GFX950-SDAG-NEXT:    v_min_f64 v[4:5], v[4:5], v[8:9]
-; GFX950-SDAG-NEXT:    v_max_f64 v[8:9], v[14:15], v[14:15]
-; GFX950-SDAG-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
-; GFX950-SDAG-NEXT:    v_min_f64 v[6:7], v[6:7], v[8:9]
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-GISEL-LABEL: v_minimumnum_v4f64:
@@ -7892,14 +8323,32 @@ define <4 x double> @v_minimumnum_v4f64_nnan(<4 x double> %x, <4 x double> %y) {
 ; GFX8-NEXT:    v_min_f64 v[6:7], v[6:7], v[14:15]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_v4f64_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[8:9]
-; GFX9-NEXT:    v_min_f64 v[2:3], v[2:3], v[10:11]
-; GFX9-NEXT:    v_min_f64 v[4:5], v[4:5], v[12:13]
-; GFX9-NEXT:    v_min_f64 v[6:7], v[6:7], v[14:15]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_v4f64_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f64 v[0:1], v[0:1], v[8:9]
+; GFX900-NEXT:    v_min_f64 v[2:3], v[2:3], v[10:11]
+; GFX900-NEXT:    v_min_f64 v[4:5], v[4:5], v[12:13]
+; GFX900-NEXT:    v_min_f64 v[6:7], v[6:7], v[14:15]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_v4f64_nnan:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_min_f64 v[6:7], v[6:7], v[14:15]
+; GFX950-SDAG-NEXT:    v_min_f64 v[4:5], v[4:5], v[12:13]
+; GFX950-SDAG-NEXT:    v_min_f64 v[2:3], v[2:3], v[10:11]
+; GFX950-SDAG-NEXT:    v_min_f64 v[0:1], v[0:1], v[8:9]
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_minimumnum_v4f64_nnan:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_min_f64 v[0:1], v[0:1], v[8:9]
+; GFX950-GISEL-NEXT:    v_min_f64 v[2:3], v[2:3], v[10:11]
+; GFX950-GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[12:13]
+; GFX950-GISEL-NEXT:    v_min_f64 v[6:7], v[6:7], v[14:15]
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v4f64_nnan:
 ; GFX10:       ; %bb.0:
@@ -7961,11 +8410,17 @@ define half @v_minimumnum_f16_no_ieee(half %x, half %y) #0 {
 ; GFX8-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_f16_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_f16_no_ieee:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_f16_no_ieee:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_f16_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8066,11 +8521,17 @@ define half @v_minimumnum_f16_nan_no_ieee(half %x, half %y) #0 {
 ; GFX8-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_f16_nan_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_f16_nan_no_ieee:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_f16_nan_no_ieee:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_f16_nan_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8126,11 +8587,17 @@ define float @v_minimumnum_f32_no_ieee(float %x, float %y) #0 {
 ; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_f32_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_f32_no_ieee:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_f32_no_ieee:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_f32_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8184,11 +8651,17 @@ define float @v_minimumnum_f32_nnan_no_ieee(float %x, float %y) #0 {
 ; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_f32_nnan_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_f32_nnan_no_ieee:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_f32_nnan_no_ieee:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_f32_nnan_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8228,11 +8701,17 @@ define double @v_minimumnum_f64_no_ieee(double %x, double %y) #0 {
 ; GFX8-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_f64_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_f64_no_ieee:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_f64_no_ieee:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_f64_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8288,11 +8767,17 @@ define double @v_minimumnum_f64_nnan_no_ieee(double %x, double %y) #0 {
 ; GFX8-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_f64_nnan_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_f64_nnan_no_ieee:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_f64_nnan_no_ieee:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_f64_nnan_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8364,11 +8849,17 @@ define <2 x half> @v_minimumnum_v2f16_no_ieee(<2 x half> %x, <2 x half> %y) #0 {
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_v2f16_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_v2f16_no_ieee:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_v2f16_no_ieee:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v2f16_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8456,11 +8947,17 @@ define <2 x half> @v_minimumnum_v2f16_nnan_no_ieee(<2 x half> %x, <2 x half> %y)
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_v2f16_nnan_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_v2f16_nnan_no_ieee:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_v2f16_nnan_no_ieee:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v2f16_nnan_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8543,19 +9040,33 @@ define <3 x half> @v_minimumnum_v3f16_nnan_no_ieee(<3 x half> %x, <3 x half> %y)
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_minimumnum_v3f16_nnan_no_ieee:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_pk_min_f16 v1, v1, v3
-; GFX9-SDAG-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_minimumnum_v3f16_nnan_no_ieee:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX900-SDAG-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_minimumnum_v3f16_nnan_no_ieee:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_v3f16_nnan_no_ieee:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX950-SDAG-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_minimumnum_v3f16_nnan_no_ieee:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v1, v1, v3
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX950-GISEL-LABEL: v_minimumnum_v3f16_nnan_no_ieee:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v3f16_nnan_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8654,12 +9165,19 @@ define <4 x half> @v_minimumnum_v4f16_nnan_no_ieee(<4 x half> %x, <4 x half> %y)
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_v4f16_nnan_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX9-NEXT:    v_pk_min_f16 v1, v1, v3
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_v4f16_nnan_no_ieee:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX900-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_v4f16_nnan_no_ieee:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX950-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v4f16_nnan_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8690,6 +9208,3 @@ define <4 x half> @v_minimumnum_v4f16_nnan_no_ieee(<4 x half> %x, <4 x half> %y)
 }
 
 attributes #0 = { "amdgpu-ieee"="false" }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX900: {{.*}}
-; GFX950: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index bef38c1a65ef8..a338bcbd3ba5c 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -181,52 +181,52 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
 ; GFX900-NEXT:    global_store_dwordx4 v0, v[1:4], s[0:1] offset:16
 ; GFX900-NEXT:    s_endpgm
 ;
-; PACKED-SDAG-LABEL: fadd_v32_vs:
-; PACKED-SDAG:       ; %bb.0:
-; PACKED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; PACKED-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; PACKED-SDAG-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
-; PACKED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1]
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:48
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:32
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:64
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
-; PACKED-SDAG-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0xa4
-; PACKED-SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0xe4
-; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(0)
-; PACKED-SDAG-NEXT:    v_pk_add_f32 v[0:1], v[0:1], s[40:41]
-; PACKED-SDAG-NEXT:    v_pk_add_f32 v[2:3], v[2:3], s[42:43]
-; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(6)
-; PACKED-SDAG-NEXT:    v_pk_add_f32 v[6:7], v[6:7], s[38:39]
-; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(5)
-; PACKED-SDAG-NEXT:    v_pk_add_f32 v[8:9], v[8:9], s[48:49]
-; PACKED-SDAG-NEXT:    v_pk_add_f32 v[10:11], v[10:11], s[50:51]
-; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(4)
-; PACKED-SDAG-NEXT:    v_pk_add_f32 v[16:17], v[16:17], s[44:45]
-; PACKED-SDAG-NEXT:    v_pk_add_f32 v[18:19], v[18:19], s[46:47]
-; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; PACKED-SDAG-NEXT:    v_pk_add_f32 v[28:29], v[28:29], s[16:17]
-; PACKED-SDAG-NEXT:    v_pk_add_f32 v[30:31], v[30:31], s[18:19]
-; PACKED-SDAG-NEXT:    v_pk_add_f32 v[20:21], v[20:21], s[12:13]
-; PACKED-SDAG-NEXT:    v_pk_add_f32 v[22:23], v[22:23], s[14:15]
-; PACKED-SDAG-NEXT:    v_pk_add_f32 v[14:15], v[14:15], s[10:11]
-; PACKED-SDAG-NEXT:    v_pk_add_f32 v[24:25], v[24:25], s[20:21]
-; PACKED-SDAG-NEXT:    v_pk_add_f32 v[26:27], v[26:27], s[22:23]
-; PACKED-SDAG-NEXT:    v_pk_add_f32 v[4:5], v[4:5], s[36:37]
-; PACKED-SDAG-NEXT:    v_pk_add_f32 v[12:13], v[12:13], s[8:9]
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:96
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:112
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[12:15], s[0:1] offset:64
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[16:19], s[0:1] offset:32
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[8:11], s[0:1] offset:48
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1]
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
-; PACKED-SDAG-NEXT:    s_endpgm
+; GFX90A-SDAG-LABEL: fadd_v32_vs:
+; GFX90A-SDAG:       ; %bb.0:
+; GFX90A-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX90A-SDAG-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
+; GFX90A-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1]
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:48
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:32
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:64
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
+; GFX90A-SDAG-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; GFX90A-SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0xe4
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(0)
+; GFX90A-SDAG-NEXT:    v_pk_add_f32 v[0:1], v[0:1], s[40:41]
+; GFX90A-SDAG-NEXT:    v_pk_add_f32 v[2:3], v[2:3], s[42:43]
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(6)
+; GFX90A-SDAG-NEXT:    v_pk_add_f32 v[6:7], v[6:7], s[38:39]
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX90A-SDAG-NEXT:    v_pk_add_f32 v[8:9], v[8:9], s[48:49]
+; GFX90A-SDAG-NEXT:    v_pk_add_f32 v[10:11], v[10:11], s[50:51]
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX90A-SDAG-NEXT:    v_pk_add_f32 v[16:17], v[16:17], s[44:45]
+; GFX90A-SDAG-NEXT:    v_pk_add_f32 v[18:19], v[18:19], s[46:47]
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-SDAG-NEXT:    v_pk_add_f32 v[28:29], v[28:29], s[16:17]
+; GFX90A-SDAG-NEXT:    v_pk_add_f32 v[30:31], v[30:31], s[18:19]
+; GFX90A-SDAG-NEXT:    v_pk_add_f32 v[20:21], v[20:21], s[12:13]
+; GFX90A-SDAG-NEXT:    v_pk_add_f32 v[22:23], v[22:23], s[14:15]
+; GFX90A-SDAG-NEXT:    v_pk_add_f32 v[14:15], v[14:15], s[10:11]
+; GFX90A-SDAG-NEXT:    v_pk_add_f32 v[24:25], v[24:25], s[20:21]
+; GFX90A-SDAG-NEXT:    v_pk_add_f32 v[26:27], v[26:27], s[22:23]
+; GFX90A-SDAG-NEXT:    v_pk_add_f32 v[4:5], v[4:5], s[36:37]
+; GFX90A-SDAG-NEXT:    v_pk_add_f32 v[12:13], v[12:13], s[8:9]
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:96
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:112
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[12:15], s[0:1] offset:64
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[16:19], s[0:1] offset:32
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[8:11], s[0:1] offset:48
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1]
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
+; GFX90A-SDAG-NEXT:    s_endpgm
 ;
 ; PACKED-GISEL-LABEL: fadd_v32_vs:
 ; PACKED-GISEL:       ; %bb.0:
@@ -277,6 +277,56 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
 ; PACKED-GISEL-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
 ; PACKED-GISEL-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
 ; PACKED-GISEL-NEXT:    s_endpgm
+;
+; GFX942-SDAG-LABEL: fadd_v32_vs:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-SDAG-NEXT:    v_lshlrev_b32_e32 v36, 7, v0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[0:3], v36, s[0:1] offset:112
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[12:15], v36, s[0:1] offset:96
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[24:27], v36, s[0:1] offset:80
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[28:31], v36, s[0:1] offset:64
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[32:35], v36, s[0:1] offset:48
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[16:19], v36, s[0:1] offset:32
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[8:11], v36, s[0:1] offset:16
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[4:7], v36, s[0:1]
+; GFX942-SDAG-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0xe4
+; GFX942-SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0xa4
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_pk_add_f32 v[2:3], v[2:3], s[50:51]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(6)
+; GFX942-SDAG-NEXT:    v_pk_add_f32 v[22:23], v[14:15], s[46:47]
+; GFX942-SDAG-NEXT:    v_pk_add_f32 v[20:21], v[12:13], s[44:45]
+; GFX942-SDAG-NEXT:    v_pk_add_f32 v[0:1], v[0:1], s[48:49]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX942-SDAG-NEXT:    v_pk_add_f32 v[14:15], v[26:27], s[42:43]
+; GFX942-SDAG-NEXT:    v_pk_add_f32 v[12:13], v[24:25], s[40:41]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX942-SDAG-NEXT:    v_pk_add_f32 v[26:27], v[30:31], s[38:39]
+; GFX942-SDAG-NEXT:    v_pk_add_f32 v[24:25], v[28:29], s[36:37]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX942-SDAG-NEXT:    v_pk_add_f32 v[30:31], v[34:35], s[22:23]
+; GFX942-SDAG-NEXT:    v_pk_add_f32 v[28:29], v[32:33], s[20:21]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX942-SDAG-NEXT:    v_pk_add_f32 v[18:19], v[18:19], s[18:19]
+; GFX942-SDAG-NEXT:    v_pk_add_f32 v[16:17], v[16:17], s[16:17]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-SDAG-NEXT:    v_pk_add_f32 v[10:11], v[10:11], s[14:15]
+; GFX942-SDAG-NEXT:    v_pk_add_f32 v[8:9], v[8:9], s[12:13]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    v_pk_add_f32 v[6:7], v[6:7], s[10:11]
+; GFX942-SDAG-NEXT:    v_pk_add_f32 v[4:5], v[4:5], s[8:9]
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v36, v[20:23], s[0:1] offset:96
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v36, v[0:3], s[0:1] offset:112
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v36, v[24:27], s[0:1] offset:64
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v36, v[12:15], s[0:1] offset:80
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:32
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v36, v[28:31], s[0:1] offset:48
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v36, v[4:7], s[0:1]
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v36, v[8:11], s[0:1] offset:16
+; GFX942-SDAG-NEXT:    s_endpgm
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id
   %load = load <32 x float>, ptr addrspace(1) %gep, align 128
@@ -959,52 +1009,52 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
 ; GFX900-NEXT:    global_store_dwordx4 v0, v[1:4], s[0:1] offset:16
 ; GFX900-NEXT:    s_endpgm
 ;
-; PACKED-SDAG-LABEL: fmul_v32_vs:
-; PACKED-SDAG:       ; %bb.0:
-; PACKED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; PACKED-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; PACKED-SDAG-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
-; PACKED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1]
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:48
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:32
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:64
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
-; PACKED-SDAG-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0xa4
-; PACKED-SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0xe4
-; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(0)
-; PACKED-SDAG-NEXT:    v_pk_mul_f32 v[0:1], v[0:1], s[40:41]
-; PACKED-SDAG-NEXT:    v_pk_mul_f32 v[2:3], v[2:3], s[42:43]
-; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(6)
-; PACKED-SDAG-NEXT:    v_pk_mul_f32 v[6:7], v[6:7], s[38:39]
-; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(5)
-; PACKED-SDAG-NEXT:    v_pk_mul_f32 v[8:9], v[8:9], s[48:49]
-; PACKED-SDAG-NEXT:    v_pk_mul_f32 v[10:11], v[10:11], s[50:51]
-; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(4)
-; PACKED-SDAG-NEXT:    v_pk_mul_f32 v[16:17], v[16:17], s[44:45]
-; PACKED-SDAG-NEXT:    v_pk_mul_f32 v[18:19], v[18:19], s[46:47]
-; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; PACKED-SDAG-NEXT:    v_pk_mul_f32 v[28:29], v[28:29], s[16:17]
-; PACKED-SDAG-NEXT:    v_pk_mul_f32 v[30:31], v[30:31], s[18:19]
-; PACKED-SDAG-NEXT:    v_pk_mul_f32 v[20:21], v[20:21], s[12:13]
-; PACKED-SDAG-NEXT:    v_pk_mul_f32 v[22:23], v[22:23], s[14:15]
-; PACKED-SDAG-NEXT:    v_pk_mul_f32 v[14:15], v[14:15], s[10:11]
-; PACKED-SDAG-NEXT:    v_pk_mul_f32 v[24:25], v[24:25], s[20:21]
-; PACKED-SDAG-NEXT:    v_pk_mul_f32 v[26:27], v[26:27], s[22:23]
-; PACKED-SDAG-NEXT:    v_pk_mul_f32 v[4:5], v[4:5], s[36:37]
-; PACKED-SDAG-NEXT:    v_pk_mul_f32 v[12:13], v[12:13], s[8:9]
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:96
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:112
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[12:15], s[0:1] offset:64
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[16:19], s[0:1] offset:32
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[8:11], s[0:1] offset:48
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1]
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
-; PACKED-SDAG-NEXT:    s_endpgm
+; GFX90A-SDAG-LABEL: fmul_v32_vs:
+; GFX90A-SDAG:       ; %bb.0:
+; GFX90A-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX90A-SDAG-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
+; GFX90A-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1]
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:48
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:32
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:64
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
+; GFX90A-SDAG-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; GFX90A-SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0xe4
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(0)
+; GFX90A-SDAG-NEXT:    v_pk_mul_f32 v[0:1], v[0:1], s[40:41]
+; GFX90A-SDAG-NEXT:    v_pk_mul_f32 v[2:3], v[2:3], s[42:43]
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(6)
+; GFX90A-SDAG-NEXT:    v_pk_mul_f32 v[6:7], v[6:7], s[38:39]
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX90A-SDAG-NEXT:    v_pk_mul_f32 v[8:9], v[8:9], s[48:49]
+; GFX90A-SDAG-NEXT:    v_pk_mul_f32 v[10:11], v[10:11], s[50:51]
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX90A-SDAG-NEXT:    v_pk_mul_f32 v[16:17], v[16:17], s[44:45]
+; GFX90A-SDAG-NEXT:    v_pk_mul_f32 v[18:19], v[18:19], s[46:47]
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-SDAG-NEXT:    v_pk_mul_f32 v[28:29], v[28:29], s[16:17]
+; GFX90A-SDAG-NEXT:    v_pk_mul_f32 v[30:31], v[30:31], s[18:19]
+; GFX90A-SDAG-NEXT:    v_pk_mul_f32 v[20:21], v[20:21], s[12:13]
+; GFX90A-SDAG-NEXT:    v_pk_mul_f32 v[22:23], v[22:23], s[14:15]
+; GFX90A-SDAG-NEXT:    v_pk_mul_f32 v[14:15], v[14:15], s[10:11]
+; GFX90A-SDAG-NEXT:    v_pk_mul_f32 v[24:25], v[24:25], s[20:21]
+; GFX90A-SDAG-NEXT:    v_pk_mul_f32 v[26:27], v[26:27], s[22:23]
+; GFX90A-SDAG-NEXT:    v_pk_mul_f32 v[4:5], v[4:5], s[36:37]
+; GFX90A-SDAG-NEXT:    v_pk_mul_f32 v[12:13], v[12:13], s[8:9]
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:96
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:112
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[12:15], s[0:1] offset:64
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[16:19], s[0:1] offset:32
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[8:11], s[0:1] offset:48
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1]
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
+; GFX90A-SDAG-NEXT:    s_endpgm
 ;
 ; PACKED-GISEL-LABEL: fmul_v32_vs:
 ; PACKED-GISEL:       ; %bb.0:
@@ -1055,6 +1105,56 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
 ; PACKED-GISEL-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
 ; PACKED-GISEL-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
 ; PACKED-GISEL-NEXT:    s_endpgm
+;
+; GFX942-SDAG-LABEL: fmul_v32_vs:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-SDAG-NEXT:    v_lshlrev_b32_e32 v36, 7, v0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[0:3], v36, s[0:1] offset:112
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[12:15], v36, s[0:1] offset:96
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[24:27], v36, s[0:1] offset:80
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[28:31], v36, s[0:1] offset:64
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[32:35], v36, s[0:1] offset:48
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[16:19], v36, s[0:1] offset:32
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[8:11], v36, s[0:1] offset:16
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[4:7], v36, s[0:1]
+; GFX942-SDAG-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0xe4
+; GFX942-SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0xa4
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_pk_mul_f32 v[2:3], v[2:3], s[50:51]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(6)
+; GFX942-SDAG-NEXT:    v_pk_mul_f32 v[22:23], v[14:15], s[46:47]
+; GFX942-SDAG-NEXT:    v_pk_mul_f32 v[20:21], v[12:13], s[44:45]
+; GFX942-SDAG-NEXT:    v_pk_mul_f32 v[0:1], v[0:1], s[48:49]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX942-SDAG-NEXT:    v_pk_mul_f32 v[14:15], v[26:27], s[42:43]
+; GFX942-SDAG-NEXT:    v_pk_mul_f32 v[12:13], v[24:25], s[40:41]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX942-SDAG-NEXT:    v_pk_mul_f32 v[26:27], v[30:31], s[38:39]
+; GFX942-SDAG-NEXT:    v_pk_mul_f32 v[24:25], v[28:29], s[36:37]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX942-SDAG-NEXT:    v_pk_mul_f32 v[30:31], v[34:35], s[22:23]
+; GFX942-SDAG-NEXT:    v_pk_mul_f32 v[28:29], v[32:33], s[20:21]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX942-SDAG-NEXT:    v_pk_mul_f32 v[18:19], v[18:19], s[18:19]
+; GFX942-SDAG-NEXT:    v_pk_mul_f32 v[16:17], v[16:17], s[16:17]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-SDAG-NEXT:    v_pk_mul_f32 v[10:11], v[10:11], s[14:15]
+; GFX942-SDAG-NEXT:    v_pk_mul_f32 v[8:9], v[8:9], s[12:13]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    v_pk_mul_f32 v[6:7], v[6:7], s[10:11]
+; GFX942-SDAG-NEXT:    v_pk_mul_f32 v[4:5], v[4:5], s[8:9]
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v36, v[20:23], s[0:1] offset:96
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v36, v[0:3], s[0:1] offset:112
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v36, v[24:27], s[0:1] offset:64
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v36, v[12:15], s[0:1] offset:80
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:32
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v36, v[28:31], s[0:1] offset:48
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v36, v[4:7], s[0:1]
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v36, v[8:11], s[0:1] offset:16
+; GFX942-SDAG-NEXT:    s_endpgm
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id
   %load = load <32 x float>, ptr addrspace(1) %gep, align 128
@@ -1466,52 +1566,52 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
 ; GFX900-NEXT:    global_store_dwordx4 v0, v[1:4], s[0:1] offset:16
 ; GFX900-NEXT:    s_endpgm
 ;
-; PACKED-SDAG-LABEL: fma_v32_vs:
-; PACKED-SDAG:       ; %bb.0:
-; PACKED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; PACKED-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; PACKED-SDAG-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
-; PACKED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1]
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:48
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:32
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:64
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
-; PACKED-SDAG-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0xa4
-; PACKED-SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0xe4
-; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(0)
-; PACKED-SDAG-NEXT:    v_pk_fma_f32 v[0:1], v[0:1], s[40:41], s[40:41]
-; PACKED-SDAG-NEXT:    v_pk_fma_f32 v[2:3], v[2:3], s[42:43], s[42:43]
-; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(6)
-; PACKED-SDAG-NEXT:    v_pk_fma_f32 v[6:7], v[6:7], s[38:39], s[38:39]
-; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(5)
-; PACKED-SDAG-NEXT:    v_pk_fma_f32 v[8:9], v[8:9], s[48:49], s[48:49]
-; PACKED-SDAG-NEXT:    v_pk_fma_f32 v[10:11], v[10:11], s[50:51], s[50:51]
-; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(4)
-; PACKED-SDAG-NEXT:    v_pk_fma_f32 v[16:17], v[16:17], s[44:45], s[44:45]
-; PACKED-SDAG-NEXT:    v_pk_fma_f32 v[18:19], v[18:19], s[46:47], s[46:47]
-; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; PACKED-SDAG-NEXT:    v_pk_fma_f32 v[28:29], v[28:29], s[16:17], s[16:17]
-; PACKED-SDAG-NEXT:    v_pk_fma_f32 v[30:31], v[30:31], s[18:19], s[18:19]
-; PACKED-SDAG-NEXT:    v_pk_fma_f32 v[20:21], v[20:21], s[12:13], s[12:13]
-; PACKED-SDAG-NEXT:    v_pk_fma_f32 v[22:23], v[22:23], s[14:15], s[14:15]
-; PACKED-SDAG-NEXT:    v_pk_fma_f32 v[14:15], v[14:15], s[10:11], s[10:11]
-; PACKED-SDAG-NEXT:    v_pk_fma_f32 v[24:25], v[24:25], s[20:21], s[20:21]
-; PACKED-SDAG-NEXT:    v_pk_fma_f32 v[26:27], v[26:27], s[22:23], s[22:23]
-; PACKED-SDAG-NEXT:    v_pk_fma_f32 v[4:5], v[4:5], s[36:37], s[36:37]
-; PACKED-SDAG-NEXT:    v_pk_fma_f32 v[12:13], v[12:13], s[8:9], s[8:9]
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:96
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:112
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[12:15], s[0:1] offset:64
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[16:19], s[0:1] offset:32
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[8:11], s[0:1] offset:48
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1]
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
-; PACKED-SDAG-NEXT:    s_endpgm
+; GFX90A-SDAG-LABEL: fma_v32_vs:
+; GFX90A-SDAG:       ; %bb.0:
+; GFX90A-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX90A-SDAG-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
+; GFX90A-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1]
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:48
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:32
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:64
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
+; GFX90A-SDAG-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; GFX90A-SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0xe4
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(0)
+; GFX90A-SDAG-NEXT:    v_pk_fma_f32 v[0:1], v[0:1], s[40:41], s[40:41]
+; GFX90A-SDAG-NEXT:    v_pk_fma_f32 v[2:3], v[2:3], s[42:43], s[42:43]
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(6)
+; GFX90A-SDAG-NEXT:    v_pk_fma_f32 v[6:7], v[6:7], s[38:39], s[38:39]
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX90A-SDAG-NEXT:    v_pk_fma_f32 v[8:9], v[8:9], s[48:49], s[48:49]
+; GFX90A-SDAG-NEXT:    v_pk_fma_f32 v[10:11], v[10:11], s[50:51], s[50:51]
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX90A-SDAG-NEXT:    v_pk_fma_f32 v[16:17], v[16:17], s[44:45], s[44:45]
+; GFX90A-SDAG-NEXT:    v_pk_fma_f32 v[18:19], v[18:19], s[46:47], s[46:47]
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-SDAG-NEXT:    v_pk_fma_f32 v[28:29], v[28:29], s[16:17], s[16:17]
+; GFX90A-SDAG-NEXT:    v_pk_fma_f32 v[30:31], v[30:31], s[18:19], s[18:19]
+; GFX90A-SDAG-NEXT:    v_pk_fma_f32 v[20:21], v[20:21], s[12:13], s[12:13]
+; GFX90A-SDAG-NEXT:    v_pk_fma_f32 v[22:23], v[22:23], s[14:15], s[14:15]
+; GFX90A-SDAG-NEXT:    v_pk_fma_f32 v[14:15], v[14:15], s[10:11], s[10:11]
+; GFX90A-SDAG-NEXT:    v_pk_fma_f32 v[24:25], v[24:25], s[20:21], s[20:21]
+; GFX90A-SDAG-NEXT:    v_pk_fma_f32 v[26:27], v[26:27], s[22:23], s[22:23]
+; GFX90A-SDAG-NEXT:    v_pk_fma_f32 v[4:5], v[4:5], s[36:37], s[36:37]
+; GFX90A-SDAG-NEXT:    v_pk_fma_f32 v[12:13], v[12:13], s[8:9], s[8:9]
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:96
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:112
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[12:15], s[0:1] offset:64
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[16:19], s[0:1] offset:32
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[8:11], s[0:1] offset:48
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1]
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
+; GFX90A-SDAG-NEXT:    s_endpgm
 ;
 ; PACKED-GISEL-LABEL: fma_v32_vs:
 ; PACKED-GISEL:       ; %bb.0:
@@ -1562,6 +1662,56 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
 ; PACKED-GISEL-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
 ; PACKED-GISEL-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
 ; PACKED-GISEL-NEXT:    s_endpgm
+;
+; GFX942-SDAG-LABEL: fma_v32_vs:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-SDAG-NEXT:    v_lshlrev_b32_e32 v24, 7, v0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[0:3], v24, s[0:1] offset:112
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[12:15], v24, s[0:1] offset:96
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[26:29], v24, s[0:1] offset:80
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[30:33], v24, s[0:1] offset:64
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[34:37], v24, s[0:1] offset:48
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[20:23], v24, s[0:1] offset:32
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[8:11], v24, s[0:1] offset:16
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[4:7], v24, s[0:1]
+; GFX942-SDAG-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0xe4
+; GFX942-SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0xa4
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_pk_fma_f32 v[2:3], v[2:3], s[50:51], s[50:51]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(6)
+; GFX942-SDAG-NEXT:    v_pk_fma_f32 v[18:19], v[14:15], s[46:47], s[46:47]
+; GFX942-SDAG-NEXT:    v_pk_fma_f32 v[16:17], v[12:13], s[44:45], s[44:45]
+; GFX942-SDAG-NEXT:    v_pk_fma_f32 v[0:1], v[0:1], s[48:49], s[48:49]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX942-SDAG-NEXT:    v_pk_fma_f32 v[14:15], v[28:29], s[42:43], s[42:43]
+; GFX942-SDAG-NEXT:    v_pk_fma_f32 v[12:13], v[26:27], s[40:41], s[40:41]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX942-SDAG-NEXT:    v_pk_fma_f32 v[28:29], v[32:33], s[38:39], s[38:39]
+; GFX942-SDAG-NEXT:    v_pk_fma_f32 v[26:27], v[30:31], s[36:37], s[36:37]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX942-SDAG-NEXT:    v_pk_fma_f32 v[32:33], v[36:37], s[22:23], s[22:23]
+; GFX942-SDAG-NEXT:    v_pk_fma_f32 v[30:31], v[34:35], s[20:21], s[20:21]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX942-SDAG-NEXT:    v_pk_fma_f32 v[22:23], v[22:23], s[18:19], s[18:19]
+; GFX942-SDAG-NEXT:    v_pk_fma_f32 v[20:21], v[20:21], s[16:17], s[16:17]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-SDAG-NEXT:    v_pk_fma_f32 v[10:11], v[10:11], s[14:15], s[14:15]
+; GFX942-SDAG-NEXT:    v_pk_fma_f32 v[8:9], v[8:9], s[12:13], s[12:13]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    v_pk_fma_f32 v[6:7], v[6:7], s[10:11], s[10:11]
+; GFX942-SDAG-NEXT:    v_pk_fma_f32 v[4:5], v[4:5], s[8:9], s[8:9]
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v24, v[16:19], s[0:1] offset:96
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v24, v[0:3], s[0:1] offset:112
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v24, v[26:29], s[0:1] offset:64
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v24, v[12:15], s[0:1] offset:80
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v24, v[20:23], s[0:1] offset:32
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v24, v[30:33], s[0:1] offset:48
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v24, v[4:7], s[0:1]
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v24, v[8:11], s[0:1] offset:16
+; GFX942-SDAG-NEXT:    s_endpgm
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id
   %load = load <32 x float>, ptr addrspace(1) %gep, align 128
@@ -2408,6 +2558,3 @@ declare i32 @llvm.amdgcn.workitem.id.x()
 declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
 declare <32 x float> @llvm.fma.v32f32(<32 x float>, <32 x float>, <32 x float>)
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX90A-SDAG: {{.*}}
-; GFX942-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
index 41fe0d48ec819..39544dd7af50c 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
@@ -476,10 +476,10 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture inreg %o
 ; GFX942-NEXT:    .p2align 8
 ; GFX942-NEXT:  ; %bb.2:
 ; GFX942-NEXT:  .LBB12_0:
-; GFX942-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX942-NEXT:    v_mov_b32_e32 v2, s8
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX942-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3]
 ; GFX942-NEXT:    s_endpgm
 ;
@@ -990,10 +990,8 @@ define amdgpu_kernel void @fp128_kernel_preload_arg(ptr addrspace(1) inreg %out,
 ; GFX942-NEXT:  ; %bb.2:
 ; GFX942-NEXT:  .LBB25_0:
 ; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, s6
-; GFX942-NEXT:    v_mov_b32_e32 v1, s7
-; GFX942-NEXT:    v_mov_b32_e32 v2, s8
-; GFX942-NEXT:    v_mov_b32_e32 v3, s9
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
 ; GFX942-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll
index 51dc9a51ec9d0..3fd7f1cb481a4 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll
@@ -88,8 +88,7 @@ define void @v_shuffle_v2i64_v2i64__1_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v2
-; GFX942-NEXT:    v_mov_b32_e32 v1, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -144,8 +143,7 @@ define void @v_shuffle_v2i64_v2i64__3_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v2
-; GFX942-NEXT:    v_mov_b32_e32 v1, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -204,10 +202,8 @@ define void @v_shuffle_v2i64_v2i64__3_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -262,8 +258,7 @@ define void @v_shuffle_v2i64_v2i64__3_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v6
-; GFX942-NEXT:    v_mov_b32_e32 v1, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -309,8 +304,7 @@ define void @v_shuffle_v2i64_v2i64__3_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -355,8 +349,7 @@ define void @v_shuffle_v2i64_v2i64__3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v2
-; GFX942-NEXT:    v_mov_b32_e32 v1, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -401,8 +394,7 @@ define void @v_shuffle_v2i64_v2i64__u_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -446,8 +438,7 @@ define void @v_shuffle_v2i64_v2i64__0_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -492,8 +483,7 @@ define void @v_shuffle_v2i64_v2i64__1_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -537,8 +527,7 @@ define void @v_shuffle_v2i64_v2i64__2_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -660,8 +649,7 @@ define void @v_shuffle_v2i64_v2i64__1_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v2
-; GFX942-NEXT:    v_mov_b32_e32 v1, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -794,8 +782,7 @@ define void @v_shuffle_v2i64_v2i64__1_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v2
-; GFX942-NEXT:    v_mov_b32_e32 v1, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -900,8 +887,7 @@ define void @v_shuffle_v2i64_v2i64__0_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -956,8 +942,7 @@ define void @v_shuffle_v2i64_v2i64__1_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v2
-; GFX942-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1095,8 +1080,7 @@ define void @s_shuffle_v2i64_v2i64__1_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1154,8 +1138,7 @@ define void @s_shuffle_v2i64_v2i64__3_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1213,10 +1196,8 @@ define void @s_shuffle_v2i64_v2i64__3_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1270,8 +1251,7 @@ define void @s_shuffle_v2i64_v2i64__3_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1320,10 +1300,8 @@ define void @s_shuffle_v2i64_v2i64__3_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1336,18 +1314,43 @@ define void @s_shuffle_v2i64_v2i64__3_2() {
 }
 
 define void @s_shuffle_v2i64_v2i64__3_3() {
-; GFX9-LABEL: s_shuffle_v2i64_v2i64__3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v2i64__3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v2i64__3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v2i64__3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <2 x i32> <i32 3, i32 3>
@@ -1388,8 +1391,7 @@ define void @s_shuffle_v2i64_v2i64__u_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1401,18 +1403,43 @@ define void @s_shuffle_v2i64_v2i64__u_0() {
 }
 
 define void @s_shuffle_v2i64_v2i64__0_0() {
-; GFX9-LABEL: s_shuffle_v2i64_v2i64__0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v2i64__0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v2i64__0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v2i64__0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <2 x i32> zeroinitializer
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -1456,10 +1483,8 @@ define void @s_shuffle_v2i64_v2i64__1_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1503,8 +1528,7 @@ define void @s_shuffle_v2i64_v2i64__2_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1596,18 +1620,43 @@ define void @s_shuffle_v2i64_v2i64__0_1() {
 }
 
 define void @s_shuffle_v2i64_v2i64__1_1() {
-; GFX9-LABEL: s_shuffle_v2i64_v2i64__1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v2i64__1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v2i64__1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v2i64__1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <2 x i32> <i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -1741,8 +1790,7 @@ define void @s_shuffle_v2i64_v2i64__1_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1850,8 +1898,7 @@ define void @s_shuffle_v2i64_v2i64__0_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1905,8 +1952,7 @@ define void @s_shuffle_v2i64_v2i64__1_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v3i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v3i64.ll
index bc8a56a30d8f9..f54d45b1367cc 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v3i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v3i64.ll
@@ -127,8 +127,7 @@ define void @v_shuffle_v2i64_v3i64__2_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v4
-; GFX942-NEXT:    v_mov_b32_e32 v1, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -223,8 +222,7 @@ define void @v_shuffle_v2i64_v3i64__5_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v4
-; GFX942-NEXT:    v_mov_b32_e32 v1, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -283,10 +281,8 @@ define void @v_shuffle_v2i64_v3i64__5_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -341,8 +337,7 @@ define void @v_shuffle_v2i64_v3i64__5_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v8
-; GFX942-NEXT:    v_mov_b32_e32 v1, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -397,8 +392,7 @@ define void @v_shuffle_v2i64_v3i64__5_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v10
-; GFX942-NEXT:    v_mov_b32_e32 v3, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -447,10 +441,8 @@ define void @v_shuffle_v2i64_v3i64__5_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -495,8 +487,7 @@ define void @v_shuffle_v2i64_v3i64__5_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v4
-; GFX942-NEXT:    v_mov_b32_e32 v1, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -541,8 +532,7 @@ define void @v_shuffle_v2i64_v3i64__5_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -587,8 +577,7 @@ define void @v_shuffle_v2i64_v3i64__u_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -632,8 +621,7 @@ define void @v_shuffle_v2i64_v3i64__0_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -677,8 +665,7 @@ define void @v_shuffle_v2i64_v3i64__1_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -726,10 +713,8 @@ define void @v_shuffle_v2i64_v3i64__2_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -773,8 +758,7 @@ define void @v_shuffle_v2i64_v3i64__3_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -828,8 +812,7 @@ define void @v_shuffle_v2i64_v3i64__4_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-NEXT:    v_mov_b32_e32 v7, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -952,8 +935,7 @@ define void @v_shuffle_v2i64_v3i64__1_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v2
-; GFX942-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -997,8 +979,7 @@ define void @v_shuffle_v2i64_v3i64__2_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v4
-; GFX942-NEXT:    v_mov_b32_e32 v1, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1091,8 +1072,7 @@ define void @v_shuffle_v2i64_v3i64__4_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v2
-; GFX942-NEXT:    v_mov_b32_e32 v9, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1176,8 +1156,7 @@ define void @v_shuffle_v2i64_v3i64__0_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1260,8 +1239,7 @@ define void @v_shuffle_v2i64_v3i64__2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1354,8 +1332,7 @@ define void @v_shuffle_v2i64_v3i64__4_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v4
-; GFX942-NEXT:    v_mov_b32_e32 v11, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1489,8 +1466,7 @@ define void @v_shuffle_v2i64_v3i64__2_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v4
-; GFX942-NEXT:    v_mov_b32_e32 v1, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1545,8 +1521,7 @@ define void @v_shuffle_v2i64_v3i64__4_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1641,8 +1616,7 @@ define void @v_shuffle_v2i64_v3i64__0_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1697,8 +1671,7 @@ define void @v_shuffle_v2i64_v3i64__1_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v6
-; GFX942-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1753,8 +1726,7 @@ define void @v_shuffle_v2i64_v3i64__2_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v4
-; GFX942-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1839,8 +1811,7 @@ define void @v_shuffle_v2i64_v3i64__4_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v2
-; GFX942-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1935,8 +1906,7 @@ define void @v_shuffle_v2i64_v3i64__0_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1991,8 +1961,7 @@ define void @v_shuffle_v2i64_v3i64__1_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v8
-; GFX942-NEXT:    v_mov_b32_e32 v5, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2047,8 +2016,7 @@ define void @v_shuffle_v2i64_v3i64__2_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v4
-; GFX942-NEXT:    v_mov_b32_e32 v9, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2093,8 +2061,7 @@ define void @v_shuffle_v2i64_v3i64__3_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2232,8 +2199,7 @@ define void @s_shuffle_v2i64_v3i64__1_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2273,8 +2239,7 @@ define void @s_shuffle_v2i64_v3i64__2_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2332,8 +2297,7 @@ define void @s_shuffle_v2i64_v3i64__4_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2374,8 +2338,7 @@ define void @s_shuffle_v2i64_v3i64__5_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2430,11 +2393,11 @@ define void @s_shuffle_v2i64_v3i64__5_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2484,8 +2447,7 @@ define void @s_shuffle_v2i64_v3i64__5_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2539,10 +2501,8 @@ define void @s_shuffle_v2i64_v3i64__5_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2587,10 +2547,8 @@ define void @s_shuffle_v2i64_v3i64__5_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2603,18 +2561,43 @@ define void @s_shuffle_v2i64_v3i64__5_3() {
 }
 
 define void @s_shuffle_v2i64_v3i64__5_4() {
-; GFX9-LABEL: s_shuffle_v2i64_v3i64__5_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v3i64__5_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v3i64__5_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v3i64__5_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> <i32 5, i32 4>
@@ -2659,10 +2642,8 @@ define void @s_shuffle_v2i64_v3i64__5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2707,8 +2688,7 @@ define void @s_shuffle_v2i64_v3i64__u_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2720,18 +2700,43 @@ define void @s_shuffle_v2i64_v3i64__u_0() {
 }
 
 define void @s_shuffle_v2i64_v3i64__0_0() {
-; GFX9-LABEL: s_shuffle_v2i64_v3i64__0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v3i64__0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v3i64__0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v3i64__0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> zeroinitializer
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -2775,10 +2780,8 @@ define void @s_shuffle_v2i64_v3i64__1_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2822,10 +2825,8 @@ define void @s_shuffle_v2i64_v3i64__2_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2869,8 +2870,7 @@ define void @s_shuffle_v2i64_v3i64__3_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2924,13 +2924,11 @@ define void @s_shuffle_v2i64_v3i64__4_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3023,18 +3021,43 @@ define void @s_shuffle_v2i64_v3i64__0_1() {
 }
 
 define void @s_shuffle_v2i64_v3i64__1_1() {
-; GFX9-LABEL: s_shuffle_v2i64_v3i64__1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v3i64__1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v3i64__1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v3i64__1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> <i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -3042,18 +3065,43 @@ define void @s_shuffle_v2i64_v3i64__1_1() {
 }
 
 define void @s_shuffle_v2i64_v3i64__2_1() {
-; GFX9-LABEL: s_shuffle_v2i64_v3i64__2_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v3i64__2_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v3i64__2_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v3i64__2_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> <i32 2, i32 1>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -3142,8 +3190,7 @@ define void @s_shuffle_v2i64_v3i64__4_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3188,8 +3235,7 @@ define void @s_shuffle_v2i64_v3i64__u_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3201,18 +3247,43 @@ define void @s_shuffle_v2i64_v3i64__u_2() {
 }
 
 define void @s_shuffle_v2i64_v3i64__0_2() {
-; GFX9-LABEL: s_shuffle_v2i64_v3i64__0_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v3i64__0_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v3i64__0_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v3i64__0_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> <i32 0, i32 2>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -3256,10 +3327,8 @@ define void @s_shuffle_v2i64_v3i64__1_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3307,10 +3376,8 @@ define void @s_shuffle_v2i64_v3i64__2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3354,8 +3421,7 @@ define void @s_shuffle_v2i64_v3i64__3_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3412,10 +3478,8 @@ define void @s_shuffle_v2i64_v3i64__4_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3514,8 +3578,7 @@ define void @s_shuffle_v2i64_v3i64__1_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3555,8 +3618,7 @@ define void @s_shuffle_v2i64_v3i64__2_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3618,10 +3680,8 @@ define void @s_shuffle_v2i64_v3i64__4_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3716,8 +3776,7 @@ define void @s_shuffle_v2i64_v3i64__0_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3771,8 +3830,7 @@ define void @s_shuffle_v2i64_v3i64__1_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3826,8 +3884,7 @@ define void @s_shuffle_v2i64_v3i64__2_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3881,18 +3938,43 @@ define void @s_shuffle_v2i64_v3i64__3_4() {
 }
 
 define void @s_shuffle_v2i64_v3i64__4_4() {
-; GFX9-LABEL: s_shuffle_v2i64_v3i64__4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v3i64__4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v3i64__4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v3i64__4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> <i32 4, i32 4>
@@ -3933,8 +4015,7 @@ define void @s_shuffle_v2i64_v3i64__u_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3988,8 +4069,7 @@ define void @s_shuffle_v2i64_v3i64__0_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4047,10 +4127,8 @@ define void @s_shuffle_v2i64_v3i64__1_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4104,10 +4182,8 @@ define void @s_shuffle_v2i64_v3i64__2_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4120,18 +4196,43 @@ define void @s_shuffle_v2i64_v3i64__2_5() {
 }
 
 define void @s_shuffle_v2i64_v3i64__3_5() {
-; GFX9-LABEL: s_shuffle_v2i64_v3i64__3_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v3i64__3_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v3i64__3_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v3i64__3_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> <i32 3, i32 5>
@@ -4176,10 +4277,8 @@ define void @s_shuffle_v2i64_v3i64__4_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v4i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v4i64.ll
index dd42a1dd44320..1c738b8f4f1d9 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v4i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v4i64.ll
@@ -166,8 +166,7 @@ define void @v_shuffle_v2i64_v4i64__3_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v6
-; GFX942-NEXT:    v_mov_b32_e32 v1, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -302,8 +301,7 @@ define void @v_shuffle_v2i64_v4i64__7_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v6
-; GFX942-NEXT:    v_mov_b32_e32 v1, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -362,10 +360,8 @@ define void @v_shuffle_v2i64_v4i64__7_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v8
-; GFX942-NEXT:    v_mov_b32_e32 v3, v9
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[8:9]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -420,8 +416,7 @@ define void @v_shuffle_v2i64_v4i64__7_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v10
-; GFX942-NEXT:    v_mov_b32_e32 v1, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -476,8 +471,7 @@ define void @v_shuffle_v2i64_v4i64__7_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v12
-; GFX942-NEXT:    v_mov_b32_e32 v3, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v14, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -532,8 +526,7 @@ define void @v_shuffle_v2i64_v4i64__7_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v14
-; GFX942-NEXT:    v_mov_b32_e32 v5, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -582,10 +575,8 @@ define void @v_shuffle_v2i64_v4i64__7_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -630,8 +621,7 @@ define void @v_shuffle_v2i64_v4i64__7_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v6
-; GFX942-NEXT:    v_mov_b32_e32 v1, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -676,8 +666,7 @@ define void @v_shuffle_v2i64_v4i64__7_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -722,8 +711,7 @@ define void @v_shuffle_v2i64_v4i64__7_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v6
-; GFX942-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -768,8 +756,7 @@ define void @v_shuffle_v2i64_v4i64__u_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -813,8 +800,7 @@ define void @v_shuffle_v2i64_v4i64__0_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -858,8 +844,7 @@ define void @v_shuffle_v2i64_v4i64__1_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -903,8 +888,7 @@ define void @v_shuffle_v2i64_v4i64__2_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-NEXT:    v_mov_b32_e32 v7, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -952,10 +936,8 @@ define void @v_shuffle_v2i64_v4i64__3_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -999,8 +981,7 @@ define void @v_shuffle_v2i64_v4i64__4_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1054,8 +1035,7 @@ define void @v_shuffle_v2i64_v4i64__5_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-NEXT:    v_mov_b32_e32 v7, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1110,8 +1090,7 @@ define void @v_shuffle_v2i64_v4i64__6_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v0
-; GFX942-NEXT:    v_mov_b32_e32 v9, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1234,8 +1213,7 @@ define void @v_shuffle_v2i64_v4i64__1_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v2
-; GFX942-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1279,8 +1257,7 @@ define void @v_shuffle_v2i64_v4i64__2_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v2
-; GFX942-NEXT:    v_mov_b32_e32 v7, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1324,8 +1301,7 @@ define void @v_shuffle_v2i64_v4i64__3_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v6
-; GFX942-NEXT:    v_mov_b32_e32 v1, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1418,8 +1394,7 @@ define void @v_shuffle_v2i64_v4i64__5_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v2
-; GFX942-NEXT:    v_mov_b32_e32 v9, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1474,8 +1449,7 @@ define void @v_shuffle_v2i64_v4i64__6_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v2
-; GFX942-NEXT:    v_mov_b32_e32 v11, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1559,8 +1533,7 @@ define void @v_shuffle_v2i64_v4i64__0_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1643,8 +1616,7 @@ define void @v_shuffle_v2i64_v4i64__2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v4
-; GFX942-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1688,8 +1660,7 @@ define void @v_shuffle_v2i64_v4i64__3_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1782,8 +1753,7 @@ define void @v_shuffle_v2i64_v4i64__5_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v4
-; GFX942-NEXT:    v_mov_b32_e32 v11, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v14, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1838,8 +1808,7 @@ define void @v_shuffle_v2i64_v4i64__6_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v4
-; GFX942-NEXT:    v_mov_b32_e32 v13, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v14, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1923,8 +1892,7 @@ define void @v_shuffle_v2i64_v4i64__0_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1968,8 +1936,7 @@ define void @v_shuffle_v2i64_v4i64__1_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v6
-; GFX942-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2052,8 +2019,7 @@ define void @v_shuffle_v2i64_v4i64__3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v6
-; GFX942-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2146,8 +2112,7 @@ define void @v_shuffle_v2i64_v4i64__5_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v6
-; GFX942-NEXT:    v_mov_b32_e32 v13, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2202,8 +2167,7 @@ define void @v_shuffle_v2i64_v4i64__6_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v6
-; GFX942-NEXT:    v_mov_b32_e32 v15, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2376,8 +2340,7 @@ define void @v_shuffle_v2i64_v4i64__3_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v6
-; GFX942-NEXT:    v_mov_b32_e32 v1, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2432,8 +2395,7 @@ define void @v_shuffle_v2i64_v4i64__5_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2478,8 +2440,7 @@ define void @v_shuffle_v2i64_v4i64__6_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-NEXT:    v_mov_b32_e32 v7, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2574,8 +2535,7 @@ define void @v_shuffle_v2i64_v4i64__0_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2630,8 +2590,7 @@ define void @v_shuffle_v2i64_v4i64__1_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v6
-; GFX942-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2686,8 +2645,7 @@ define void @v_shuffle_v2i64_v4i64__2_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v8
-; GFX942-NEXT:    v_mov_b32_e32 v7, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v14, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2742,8 +2700,7 @@ define void @v_shuffle_v2i64_v4i64__3_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v6
-; GFX942-NEXT:    v_mov_b32_e32 v9, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2828,8 +2785,7 @@ define void @v_shuffle_v2i64_v4i64__5_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v2
-; GFX942-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2874,8 +2830,7 @@ define void @v_shuffle_v2i64_v4i64__6_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v2
-; GFX942-NEXT:    v_mov_b32_e32 v7, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2970,8 +2925,7 @@ define void @v_shuffle_v2i64_v4i64__0_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3026,8 +2980,7 @@ define void @v_shuffle_v2i64_v4i64__1_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v8
-; GFX942-NEXT:    v_mov_b32_e32 v5, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3082,8 +3035,7 @@ define void @v_shuffle_v2i64_v4i64__2_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v10
-; GFX942-NEXT:    v_mov_b32_e32 v7, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v14, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3138,8 +3090,7 @@ define void @v_shuffle_v2i64_v4i64__3_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v6
-; GFX942-NEXT:    v_mov_b32_e32 v11, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3184,8 +3135,7 @@ define void @v_shuffle_v2i64_v4i64__4_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3270,8 +3220,7 @@ define void @v_shuffle_v2i64_v4i64__6_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v4
-; GFX942-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3366,8 +3315,7 @@ define void @v_shuffle_v2i64_v4i64__0_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v8
-; GFX942-NEXT:    v_mov_b32_e32 v3, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3422,8 +3370,7 @@ define void @v_shuffle_v2i64_v4i64__1_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v10
-; GFX942-NEXT:    v_mov_b32_e32 v5, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3478,8 +3425,7 @@ define void @v_shuffle_v2i64_v4i64__2_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v12
-; GFX942-NEXT:    v_mov_b32_e32 v7, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v14, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3534,8 +3480,7 @@ define void @v_shuffle_v2i64_v4i64__3_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v6
-; GFX942-NEXT:    v_mov_b32_e32 v13, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3580,8 +3525,7 @@ define void @v_shuffle_v2i64_v4i64__4_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3626,8 +3570,7 @@ define void @v_shuffle_v2i64_v4i64__5_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v6
-; GFX942-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3765,8 +3708,7 @@ define void @s_shuffle_v2i64_v4i64__1_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3850,8 +3792,7 @@ define void @s_shuffle_v2i64_v4i64__3_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3909,8 +3850,7 @@ define void @s_shuffle_v2i64_v4i64__5_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3996,8 +3936,7 @@ define void @s_shuffle_v2i64_v4i64__7_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4056,10 +3995,8 @@ define void @s_shuffle_v2i64_v4i64__7_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4113,8 +4050,7 @@ define void @s_shuffle_v2i64_v4i64__7_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4172,10 +4108,8 @@ define void @s_shuffle_v2i64_v4i64__7_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4230,8 +4164,7 @@ define void @s_shuffle_v2i64_v4i64__7_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4280,10 +4213,8 @@ define void @s_shuffle_v2i64_v4i64__7_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4296,18 +4227,43 @@ define void @s_shuffle_v2i64_v4i64__7_4() {
 }
 
 define void @s_shuffle_v2i64_v4i64__7_5() {
-; GFX9-LABEL: s_shuffle_v2i64_v4i64__7_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v4i64__7_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v4i64__7_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v4i64__7_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> <i32 7, i32 5>
@@ -4352,10 +4308,8 @@ define void @s_shuffle_v2i64_v4i64__7_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4368,18 +4322,43 @@ define void @s_shuffle_v2i64_v4i64__7_6() {
 }
 
 define void @s_shuffle_v2i64_v4i64__7_7() {
-; GFX9-LABEL: s_shuffle_v2i64_v4i64__7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v4i64__7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v4i64__7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v4i64__7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> <i32 7, i32 7>
@@ -4420,8 +4399,7 @@ define void @s_shuffle_v2i64_v4i64__u_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4433,18 +4411,43 @@ define void @s_shuffle_v2i64_v4i64__u_0() {
 }
 
 define void @s_shuffle_v2i64_v4i64__0_0() {
-; GFX9-LABEL: s_shuffle_v2i64_v4i64__0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v4i64__0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v4i64__0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v4i64__0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> zeroinitializer
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -4488,10 +4491,8 @@ define void @s_shuffle_v2i64_v4i64__1_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4503,18 +4504,43 @@ define void @s_shuffle_v2i64_v4i64__1_0() {
 }
 
 define void @s_shuffle_v2i64_v4i64__2_0() {
-; GFX9-LABEL: s_shuffle_v2i64_v4i64__2_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s4
-; GFX9-NEXT:    s_mov_b32 s11, s5
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v4i64__2_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v4i64__2_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v4i64__2_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> <i32 2, i32 0>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -4558,10 +4584,8 @@ define void @s_shuffle_v2i64_v4i64__3_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4605,8 +4629,7 @@ define void @s_shuffle_v2i64_v4i64__4_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4664,10 +4687,8 @@ define void @s_shuffle_v2i64_v4i64__5_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4722,8 +4743,7 @@ define void @s_shuffle_v2i64_v4i64__6_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4816,18 +4836,43 @@ define void @s_shuffle_v2i64_v4i64__0_1() {
 }
 
 define void @s_shuffle_v2i64_v4i64__1_1() {
-; GFX9-LABEL: s_shuffle_v2i64_v4i64__1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v4i64__1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v4i64__1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v4i64__1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> <i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -4835,18 +4880,43 @@ define void @s_shuffle_v2i64_v4i64__1_1() {
 }
 
 define void @s_shuffle_v2i64_v4i64__2_1() {
-; GFX9-LABEL: s_shuffle_v2i64_v4i64__2_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s6
-; GFX9-NEXT:    s_mov_b32 s11, s7
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v4i64__2_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s6
+; GFX900-NEXT:    s_mov_b32 s11, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v4i64__2_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s6
+; GFX90A-NEXT:    s_mov_b32 s11, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v4i64__2_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> <i32 2, i32 1>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -4854,18 +4924,43 @@ define void @s_shuffle_v2i64_v4i64__2_1() {
 }
 
 define void @s_shuffle_v2i64_v4i64__3_1() {
-; GFX9-LABEL: s_shuffle_v2i64_v4i64__3_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v4i64__3_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v4i64__3_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v4i64__3_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> <i32 3, i32 1>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -4954,8 +5049,7 @@ define void @s_shuffle_v2i64_v4i64__5_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5010,8 +5104,7 @@ define void @s_shuffle_v2i64_v4i64__6_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5056,8 +5149,7 @@ define void @s_shuffle_v2i64_v4i64__u_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5069,18 +5161,43 @@ define void @s_shuffle_v2i64_v4i64__u_2() {
 }
 
 define void @s_shuffle_v2i64_v4i64__0_2() {
-; GFX9-LABEL: s_shuffle_v2i64_v4i64__0_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v4i64__0_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v4i64__0_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v4i64__0_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> <i32 0, i32 2>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -5124,10 +5241,8 @@ define void @s_shuffle_v2i64_v4i64__1_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5139,18 +5254,43 @@ define void @s_shuffle_v2i64_v4i64__1_2() {
 }
 
 define void @s_shuffle_v2i64_v4i64__2_2() {
-; GFX9-LABEL: s_shuffle_v2i64_v4i64__2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v4i64__2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v4i64__2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v4i64__2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> <i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -5194,10 +5334,8 @@ define void @s_shuffle_v2i64_v4i64__3_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5241,8 +5379,7 @@ define void @s_shuffle_v2i64_v4i64__4_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5299,10 +5436,8 @@ define void @s_shuffle_v2i64_v4i64__5_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5357,8 +5492,7 @@ define void @s_shuffle_v2i64_v4i64__6_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5411,18 +5545,43 @@ define void @s_shuffle_v2i64_v4i64__u_3() {
 }
 
 define void @s_shuffle_v2i64_v4i64__0_3() {
-; GFX9-LABEL: s_shuffle_v2i64_v4i64__0_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v4i64__0_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v4i64__0_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v4i64__0_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> <i32 0, i32 3>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -5430,18 +5589,43 @@ define void @s_shuffle_v2i64_v4i64__0_3() {
 }
 
 define void @s_shuffle_v2i64_v4i64__1_3() {
-; GFX9-LABEL: s_shuffle_v2i64_v4i64__1_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v4i64__1_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v4i64__1_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v4i64__1_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> <i32 1, i32 3>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -5489,18 +5673,43 @@ define void @s_shuffle_v2i64_v4i64__2_3() {
 }
 
 define void @s_shuffle_v2i64_v4i64__3_3() {
-; GFX9-LABEL: s_shuffle_v2i64_v4i64__3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v4i64__3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v4i64__3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v4i64__3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> <i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -5590,8 +5799,7 @@ define void @s_shuffle_v2i64_v4i64__5_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5646,8 +5854,7 @@ define void @s_shuffle_v2i64_v4i64__6_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5746,8 +5953,7 @@ define void @s_shuffle_v2i64_v4i64__1_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5831,8 +6037,7 @@ define void @s_shuffle_v2i64_v4i64__3_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5894,10 +6099,8 @@ define void @s_shuffle_v2i64_v4i64__5_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5910,18 +6113,43 @@ define void @s_shuffle_v2i64_v4i64__5_4() {
 }
 
 define void @s_shuffle_v2i64_v4i64__6_4() {
-; GFX9-LABEL: s_shuffle_v2i64_v4i64__6_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s4
-; GFX9-NEXT:    s_mov_b32 s11, s5
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v4i64__6_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v4i64__6_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v4i64__6_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> <i32 6, i32 4>
@@ -6012,8 +6240,7 @@ define void @s_shuffle_v2i64_v4i64__0_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6067,8 +6294,7 @@ define void @s_shuffle_v2i64_v4i64__1_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6123,8 +6349,7 @@ define void @s_shuffle_v2i64_v4i64__2_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6178,8 +6403,7 @@ define void @s_shuffle_v2i64_v4i64__3_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6233,18 +6457,43 @@ define void @s_shuffle_v2i64_v4i64__4_5() {
 }
 
 define void @s_shuffle_v2i64_v4i64__5_5() {
-; GFX9-LABEL: s_shuffle_v2i64_v4i64__5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v4i64__5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v4i64__5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v4i64__5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> <i32 5, i32 5>
@@ -6253,18 +6502,43 @@ define void @s_shuffle_v2i64_v4i64__5_5() {
 }
 
 define void @s_shuffle_v2i64_v4i64__6_5() {
-; GFX9-LABEL: s_shuffle_v2i64_v4i64__6_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s6
-; GFX9-NEXT:    s_mov_b32 s11, s7
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v4i64__6_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s6
+; GFX900-NEXT:    s_mov_b32 s11, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v4i64__6_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s6
+; GFX90A-NEXT:    s_mov_b32 s11, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v4i64__6_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> <i32 6, i32 5>
@@ -6305,8 +6579,7 @@ define void @s_shuffle_v2i64_v4i64__u_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6360,8 +6633,7 @@ define void @s_shuffle_v2i64_v4i64__0_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6419,10 +6691,8 @@ define void @s_shuffle_v2i64_v4i64__1_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6477,8 +6747,7 @@ define void @s_shuffle_v2i64_v4i64__2_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6536,10 +6805,8 @@ define void @s_shuffle_v2i64_v4i64__3_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6552,18 +6819,43 @@ define void @s_shuffle_v2i64_v4i64__3_6() {
 }
 
 define void @s_shuffle_v2i64_v4i64__4_6() {
-; GFX9-LABEL: s_shuffle_v2i64_v4i64__4_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v4i64__4_6:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v4i64__4_6:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v4i64__4_6:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> <i32 4, i32 6>
@@ -6608,10 +6900,8 @@ define void @s_shuffle_v2i64_v4i64__5_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6624,18 +6914,43 @@ define void @s_shuffle_v2i64_v4i64__5_6() {
 }
 
 define void @s_shuffle_v2i64_v4i64__6_6() {
-; GFX9-LABEL: s_shuffle_v2i64_v4i64__6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v4i64__6_6:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v4i64__6_6:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v4i64__6_6:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> <i32 6, i32 6>
@@ -6726,8 +7041,7 @@ define void @s_shuffle_v2i64_v4i64__0_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6782,8 +7096,7 @@ define void @s_shuffle_v2i64_v4i64__1_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6838,8 +7151,7 @@ define void @s_shuffle_v2i64_v4i64__2_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6894,8 +7206,7 @@ define void @s_shuffle_v2i64_v4i64__3_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6908,18 +7219,43 @@ define void @s_shuffle_v2i64_v4i64__3_7() {
 }
 
 define void @s_shuffle_v2i64_v4i64__4_7() {
-; GFX9-LABEL: s_shuffle_v2i64_v4i64__4_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v4i64__4_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v4i64__4_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v4i64__4_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> <i32 4, i32 7>
@@ -6928,18 +7264,43 @@ define void @s_shuffle_v2i64_v4i64__4_7() {
 }
 
 define void @s_shuffle_v2i64_v4i64__5_7() {
-; GFX9-LABEL: s_shuffle_v2i64_v4i64__5_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v4i64__5_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v4i64__5_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v4i64__5_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> <i32 5, i32 7>
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll
index 7ee7c83e0122d..c8aac3a841c69 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll
@@ -322,8 +322,7 @@ define void @v_shuffle_v2i64_v8i64__7_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v14
-; GFX942-NEXT:    v_mov_b32_e32 v1, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -618,8 +617,7 @@ define void @v_shuffle_v2i64_v8i64__15_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v14
-; GFX942-NEXT:    v_mov_b32_e32 v1, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -678,10 +676,8 @@ define void @v_shuffle_v2i64_v8i64__15_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:17]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v16
-; GFX942-NEXT:    v_mov_b32_e32 v3, v17
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[16:17]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -736,8 +732,7 @@ define void @v_shuffle_v2i64_v8i64__15_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v18
-; GFX942-NEXT:    v_mov_b32_e32 v1, v19
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[18:19]
 ; GFX942-NEXT:    global_store_dwordx4 v20, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -792,8 +787,7 @@ define void @v_shuffle_v2i64_v8i64__15_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:21]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v20
-; GFX942-NEXT:    v_mov_b32_e32 v3, v21
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[20:21]
 ; GFX942-NEXT:    global_store_dwordx4 v22, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -848,8 +842,7 @@ define void @v_shuffle_v2i64_v8i64__15_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v22
-; GFX942-NEXT:    v_mov_b32_e32 v5, v23
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[22:23]
 ; GFX942-NEXT:    global_store_dwordx4 v24, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -904,8 +897,7 @@ define void @v_shuffle_v2i64_v8i64__15_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[10:25]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v24
-; GFX942-NEXT:    v_mov_b32_e32 v7, v25
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[24:25]
 ; GFX942-NEXT:    global_store_dwordx4 v26, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -960,8 +952,7 @@ define void @v_shuffle_v2i64_v8i64__15_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v26
-; GFX942-NEXT:    v_mov_b32_e32 v9, v27
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[26:27]
 ; GFX942-NEXT:    global_store_dwordx4 v28, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1016,8 +1007,7 @@ define void @v_shuffle_v2i64_v8i64__15_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[14:29]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v28
-; GFX942-NEXT:    v_mov_b32_e32 v11, v29
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[28:29]
 ; GFX942-NEXT:    global_store_dwordx4 v30, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1072,8 +1062,7 @@ define void @v_shuffle_v2i64_v8i64__15_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v30
-; GFX942-NEXT:    v_mov_b32_e32 v13, v31
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[30:31]
 ; GFX942-NEXT:    global_store_dwordx4 v32, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1122,10 +1111,8 @@ define void @v_shuffle_v2i64_v8i64__15_8(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v14
-; GFX942-NEXT:    v_mov_b32_e32 v3, v15
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1170,8 +1157,7 @@ define void @v_shuffle_v2i64_v8i64__15_9(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v14
-; GFX942-NEXT:    v_mov_b32_e32 v1, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1216,8 +1202,7 @@ define void @v_shuffle_v2i64_v8i64__15_10(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v14
-; GFX942-NEXT:    v_mov_b32_e32 v3, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1262,8 +1247,7 @@ define void @v_shuffle_v2i64_v8i64__15_11(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v14
-; GFX942-NEXT:    v_mov_b32_e32 v5, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1308,8 +1292,7 @@ define void @v_shuffle_v2i64_v8i64__15_12(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v14
-; GFX942-NEXT:    v_mov_b32_e32 v7, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1354,8 +1337,7 @@ define void @v_shuffle_v2i64_v8i64__15_13(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v14
-; GFX942-NEXT:    v_mov_b32_e32 v9, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1400,8 +1382,7 @@ define void @v_shuffle_v2i64_v8i64__15_14(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v14
-; GFX942-NEXT:    v_mov_b32_e32 v11, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1446,8 +1427,7 @@ define void @v_shuffle_v2i64_v8i64__15_15(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v14
-; GFX942-NEXT:    v_mov_b32_e32 v13, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1492,8 +1472,7 @@ define void @v_shuffle_v2i64_v8i64__u_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1537,8 +1516,7 @@ define void @v_shuffle_v2i64_v8i64__0_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1582,8 +1560,7 @@ define void @v_shuffle_v2i64_v8i64__1_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1627,8 +1604,7 @@ define void @v_shuffle_v2i64_v8i64__2_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-NEXT:    v_mov_b32_e32 v7, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1672,8 +1648,7 @@ define void @v_shuffle_v2i64_v8i64__3_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v0
-; GFX942-NEXT:    v_mov_b32_e32 v9, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1717,8 +1692,7 @@ define void @v_shuffle_v2i64_v8i64__4_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v0
-; GFX942-NEXT:    v_mov_b32_e32 v11, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1762,8 +1736,7 @@ define void @v_shuffle_v2i64_v8i64__5_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v0
-; GFX942-NEXT:    v_mov_b32_e32 v13, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1807,8 +1780,7 @@ define void @v_shuffle_v2i64_v8i64__6_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v0
-; GFX942-NEXT:    v_mov_b32_e32 v15, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1856,10 +1828,8 @@ define void @v_shuffle_v2i64_v8i64__7_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v14
-; GFX942-NEXT:    v_mov_b32_e32 v3, v15
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1903,8 +1873,7 @@ define void @v_shuffle_v2i64_v8i64__8_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1958,8 +1927,7 @@ define void @v_shuffle_v2i64_v8i64__9_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:17]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-NEXT:    v_mov_b32_e32 v7, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2014,8 +1982,7 @@ define void @v_shuffle_v2i64_v8i64__10_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:17]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v0
-; GFX942-NEXT:    v_mov_b32_e32 v9, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2070,8 +2037,7 @@ define void @v_shuffle_v2i64_v8i64__11_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:17]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v0
-; GFX942-NEXT:    v_mov_b32_e32 v11, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2126,8 +2092,7 @@ define void @v_shuffle_v2i64_v8i64__12_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:17]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v0
-; GFX942-NEXT:    v_mov_b32_e32 v13, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2182,8 +2147,7 @@ define void @v_shuffle_v2i64_v8i64__13_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:17]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v0
-; GFX942-NEXT:    v_mov_b32_e32 v15, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2238,8 +2202,7 @@ define void @v_shuffle_v2i64_v8i64__14_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:17]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v16, v0
-; GFX942-NEXT:    v_mov_b32_e32 v17, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[16:17], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[14:17], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2362,8 +2325,7 @@ define void @v_shuffle_v2i64_v8i64__1_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v2
-; GFX942-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2407,8 +2369,7 @@ define void @v_shuffle_v2i64_v8i64__2_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v2
-; GFX942-NEXT:    v_mov_b32_e32 v7, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2452,8 +2413,7 @@ define void @v_shuffle_v2i64_v8i64__3_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v2
-; GFX942-NEXT:    v_mov_b32_e32 v9, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2497,8 +2457,7 @@ define void @v_shuffle_v2i64_v8i64__4_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v2
-; GFX942-NEXT:    v_mov_b32_e32 v11, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2542,8 +2501,7 @@ define void @v_shuffle_v2i64_v8i64__5_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v2
-; GFX942-NEXT:    v_mov_b32_e32 v13, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2587,8 +2545,7 @@ define void @v_shuffle_v2i64_v8i64__6_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v2
-; GFX942-NEXT:    v_mov_b32_e32 v15, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2632,8 +2589,7 @@ define void @v_shuffle_v2i64_v8i64__7_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v14
-; GFX942-NEXT:    v_mov_b32_e32 v1, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2726,8 +2682,7 @@ define void @v_shuffle_v2i64_v8i64__9_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v2
-; GFX942-NEXT:    v_mov_b32_e32 v9, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v20, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2782,8 +2737,7 @@ define void @v_shuffle_v2i64_v8i64__10_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v2
-; GFX942-NEXT:    v_mov_b32_e32 v11, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v20, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2838,8 +2792,7 @@ define void @v_shuffle_v2i64_v8i64__11_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v2
-; GFX942-NEXT:    v_mov_b32_e32 v13, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v20, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2894,8 +2847,7 @@ define void @v_shuffle_v2i64_v8i64__12_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v2
-; GFX942-NEXT:    v_mov_b32_e32 v15, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v20, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2950,8 +2902,7 @@ define void @v_shuffle_v2i64_v8i64__13_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v16, v2
-; GFX942-NEXT:    v_mov_b32_e32 v17, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[16:17], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v20, v[14:17], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3006,8 +2957,7 @@ define void @v_shuffle_v2i64_v8i64__14_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v18, v2
-; GFX942-NEXT:    v_mov_b32_e32 v19, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[18:19], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v20, v[16:19], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3091,8 +3041,7 @@ define void @v_shuffle_v2i64_v8i64__0_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3175,8 +3124,7 @@ define void @v_shuffle_v2i64_v8i64__2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v4
-; GFX942-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3220,8 +3168,7 @@ define void @v_shuffle_v2i64_v8i64__3_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v4
-; GFX942-NEXT:    v_mov_b32_e32 v9, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3265,8 +3212,7 @@ define void @v_shuffle_v2i64_v8i64__4_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v4
-; GFX942-NEXT:    v_mov_b32_e32 v11, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3310,8 +3256,7 @@ define void @v_shuffle_v2i64_v8i64__5_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v4
-; GFX942-NEXT:    v_mov_b32_e32 v13, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3355,8 +3300,7 @@ define void @v_shuffle_v2i64_v8i64__6_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v4
-; GFX942-NEXT:    v_mov_b32_e32 v15, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3400,8 +3344,7 @@ define void @v_shuffle_v2i64_v8i64__7_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v14
-; GFX942-NEXT:    v_mov_b32_e32 v3, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3494,8 +3437,7 @@ define void @v_shuffle_v2i64_v8i64__9_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:21]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v4
-; GFX942-NEXT:    v_mov_b32_e32 v11, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v22, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3550,8 +3492,7 @@ define void @v_shuffle_v2i64_v8i64__10_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:21]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v4
-; GFX942-NEXT:    v_mov_b32_e32 v13, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v22, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3606,8 +3547,7 @@ define void @v_shuffle_v2i64_v8i64__11_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:21]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v4
-; GFX942-NEXT:    v_mov_b32_e32 v15, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v22, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3662,8 +3602,7 @@ define void @v_shuffle_v2i64_v8i64__12_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:21]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v16, v4
-; GFX942-NEXT:    v_mov_b32_e32 v17, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[16:17], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v22, v[14:17], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3718,8 +3657,7 @@ define void @v_shuffle_v2i64_v8i64__13_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:21]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v18, v4
-; GFX942-NEXT:    v_mov_b32_e32 v19, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[18:19], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v22, v[16:19], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3774,8 +3712,7 @@ define void @v_shuffle_v2i64_v8i64__14_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:21]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v20, v4
-; GFX942-NEXT:    v_mov_b32_e32 v21, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[20:21], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v22, v[18:21], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3859,8 +3796,7 @@ define void @v_shuffle_v2i64_v8i64__0_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3904,8 +3840,7 @@ define void @v_shuffle_v2i64_v8i64__1_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v6
-; GFX942-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3988,8 +3923,7 @@ define void @v_shuffle_v2i64_v8i64__3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v6
-; GFX942-NEXT:    v_mov_b32_e32 v9, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4033,8 +3967,7 @@ define void @v_shuffle_v2i64_v8i64__4_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v6
-; GFX942-NEXT:    v_mov_b32_e32 v11, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4078,8 +4011,7 @@ define void @v_shuffle_v2i64_v8i64__5_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v6
-; GFX942-NEXT:    v_mov_b32_e32 v13, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4123,8 +4055,7 @@ define void @v_shuffle_v2i64_v8i64__6_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v6
-; GFX942-NEXT:    v_mov_b32_e32 v15, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4168,8 +4099,7 @@ define void @v_shuffle_v2i64_v8i64__7_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v14
-; GFX942-NEXT:    v_mov_b32_e32 v5, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4262,8 +4192,7 @@ define void @v_shuffle_v2i64_v8i64__9_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v6
-; GFX942-NEXT:    v_mov_b32_e32 v13, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v24, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4318,8 +4247,7 @@ define void @v_shuffle_v2i64_v8i64__10_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v6
-; GFX942-NEXT:    v_mov_b32_e32 v15, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v24, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4374,8 +4302,7 @@ define void @v_shuffle_v2i64_v8i64__11_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v16, v6
-; GFX942-NEXT:    v_mov_b32_e32 v17, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[16:17], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v24, v[14:17], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4430,8 +4357,7 @@ define void @v_shuffle_v2i64_v8i64__12_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v18, v6
-; GFX942-NEXT:    v_mov_b32_e32 v19, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[18:19], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v24, v[16:19], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4486,8 +4412,7 @@ define void @v_shuffle_v2i64_v8i64__13_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v20, v6
-; GFX942-NEXT:    v_mov_b32_e32 v21, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[20:21], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v24, v[18:21], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4542,8 +4467,7 @@ define void @v_shuffle_v2i64_v8i64__14_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v22, v6
-; GFX942-NEXT:    v_mov_b32_e32 v23, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[22:23], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v24, v[20:23], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4627,8 +4551,7 @@ define void @v_shuffle_v2i64_v8i64__0_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v8
-; GFX942-NEXT:    v_mov_b32_e32 v3, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4672,8 +4595,7 @@ define void @v_shuffle_v2i64_v8i64__1_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v8
-; GFX942-NEXT:    v_mov_b32_e32 v5, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4717,8 +4639,7 @@ define void @v_shuffle_v2i64_v8i64__2_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v8
-; GFX942-NEXT:    v_mov_b32_e32 v7, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4801,8 +4722,7 @@ define void @v_shuffle_v2i64_v8i64__4_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v8
-; GFX942-NEXT:    v_mov_b32_e32 v11, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4846,8 +4766,7 @@ define void @v_shuffle_v2i64_v8i64__5_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v8
-; GFX942-NEXT:    v_mov_b32_e32 v13, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4891,8 +4810,7 @@ define void @v_shuffle_v2i64_v8i64__6_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v8
-; GFX942-NEXT:    v_mov_b32_e32 v15, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4936,8 +4854,7 @@ define void @v_shuffle_v2i64_v8i64__7_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v14
-; GFX942-NEXT:    v_mov_b32_e32 v7, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -5030,8 +4947,7 @@ define void @v_shuffle_v2i64_v8i64__9_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[10:25]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v8
-; GFX942-NEXT:    v_mov_b32_e32 v15, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v26, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -5086,8 +5002,7 @@ define void @v_shuffle_v2i64_v8i64__10_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[10:25]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v16, v8
-; GFX942-NEXT:    v_mov_b32_e32 v17, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[16:17], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v26, v[14:17], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -5142,8 +5057,7 @@ define void @v_shuffle_v2i64_v8i64__11_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[10:25]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v18, v8
-; GFX942-NEXT:    v_mov_b32_e32 v19, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[18:19], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v26, v[16:19], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -5198,8 +5112,7 @@ define void @v_shuffle_v2i64_v8i64__12_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[10:25]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v20, v8
-; GFX942-NEXT:    v_mov_b32_e32 v21, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[20:21], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v26, v[18:21], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -5254,8 +5167,7 @@ define void @v_shuffle_v2i64_v8i64__13_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[10:25]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v22, v8
-; GFX942-NEXT:    v_mov_b32_e32 v23, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[22:23], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v26, v[20:23], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -5310,8 +5222,7 @@ define void @v_shuffle_v2i64_v8i64__14_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[10:25]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v24, v8
-; GFX942-NEXT:    v_mov_b32_e32 v25, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[24:25], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v26, v[22:25], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -5395,8 +5306,7 @@ define void @v_shuffle_v2i64_v8i64__0_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v10
-; GFX942-NEXT:    v_mov_b32_e32 v3, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -5440,8 +5350,7 @@ define void @v_shuffle_v2i64_v8i64__1_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v10
-; GFX942-NEXT:    v_mov_b32_e32 v5, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -5485,8 +5394,7 @@ define void @v_shuffle_v2i64_v8i64__2_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v10
-; GFX942-NEXT:    v_mov_b32_e32 v7, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -5530,8 +5438,7 @@ define void @v_shuffle_v2i64_v8i64__3_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v10
-; GFX942-NEXT:    v_mov_b32_e32 v9, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -5614,8 +5521,7 @@ define void @v_shuffle_v2i64_v8i64__5_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v10
-; GFX942-NEXT:    v_mov_b32_e32 v13, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -5659,8 +5565,7 @@ define void @v_shuffle_v2i64_v8i64__6_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v10
-; GFX942-NEXT:    v_mov_b32_e32 v15, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -5704,8 +5609,7 @@ define void @v_shuffle_v2i64_v8i64__7_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v14
-; GFX942-NEXT:    v_mov_b32_e32 v9, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -5798,8 +5702,7 @@ define void @v_shuffle_v2i64_v8i64__9_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v16, v10
-; GFX942-NEXT:    v_mov_b32_e32 v17, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[16:17], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v28, v[14:17], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -5854,8 +5757,7 @@ define void @v_shuffle_v2i64_v8i64__10_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v18, v10
-; GFX942-NEXT:    v_mov_b32_e32 v19, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[18:19], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v28, v[16:19], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -5910,8 +5812,7 @@ define void @v_shuffle_v2i64_v8i64__11_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v20, v10
-; GFX942-NEXT:    v_mov_b32_e32 v21, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[20:21], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v28, v[18:21], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -5966,8 +5867,7 @@ define void @v_shuffle_v2i64_v8i64__12_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v22, v10
-; GFX942-NEXT:    v_mov_b32_e32 v23, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[22:23], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v28, v[20:23], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -6022,8 +5922,7 @@ define void @v_shuffle_v2i64_v8i64__13_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v24, v10
-; GFX942-NEXT:    v_mov_b32_e32 v25, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[24:25], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v28, v[22:25], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -6078,8 +5977,7 @@ define void @v_shuffle_v2i64_v8i64__14_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v26, v10
-; GFX942-NEXT:    v_mov_b32_e32 v27, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[26:27], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v28, v[24:27], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -6163,8 +6061,7 @@ define void @v_shuffle_v2i64_v8i64__0_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v12
-; GFX942-NEXT:    v_mov_b32_e32 v3, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -6208,8 +6105,7 @@ define void @v_shuffle_v2i64_v8i64__1_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v12
-; GFX942-NEXT:    v_mov_b32_e32 v5, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -6253,8 +6149,7 @@ define void @v_shuffle_v2i64_v8i64__2_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v12
-; GFX942-NEXT:    v_mov_b32_e32 v7, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -6298,8 +6193,7 @@ define void @v_shuffle_v2i64_v8i64__3_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v12
-; GFX942-NEXT:    v_mov_b32_e32 v9, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -6343,8 +6237,7 @@ define void @v_shuffle_v2i64_v8i64__4_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v12
-; GFX942-NEXT:    v_mov_b32_e32 v11, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -6427,8 +6320,7 @@ define void @v_shuffle_v2i64_v8i64__6_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v12
-; GFX942-NEXT:    v_mov_b32_e32 v15, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -6472,8 +6364,7 @@ define void @v_shuffle_v2i64_v8i64__7_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v14
-; GFX942-NEXT:    v_mov_b32_e32 v11, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -6566,8 +6457,7 @@ define void @v_shuffle_v2i64_v8i64__9_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[14:29]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v18, v12
-; GFX942-NEXT:    v_mov_b32_e32 v19, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[18:19], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v30, v[16:19], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -6622,8 +6512,7 @@ define void @v_shuffle_v2i64_v8i64__10_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[14:29]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v20, v12
-; GFX942-NEXT:    v_mov_b32_e32 v21, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[20:21], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v30, v[18:21], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -6678,8 +6567,7 @@ define void @v_shuffle_v2i64_v8i64__11_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[14:29]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v22, v12
-; GFX942-NEXT:    v_mov_b32_e32 v23, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[22:23], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v30, v[20:23], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -6734,8 +6622,7 @@ define void @v_shuffle_v2i64_v8i64__12_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[14:29]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v24, v12
-; GFX942-NEXT:    v_mov_b32_e32 v25, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[24:25], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v30, v[22:25], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -6790,8 +6677,7 @@ define void @v_shuffle_v2i64_v8i64__13_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[14:29]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v26, v12
-; GFX942-NEXT:    v_mov_b32_e32 v27, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[26:27], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v30, v[24:27], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -6846,8 +6732,7 @@ define void @v_shuffle_v2i64_v8i64__14_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[14:29]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v28, v12
-; GFX942-NEXT:    v_mov_b32_e32 v29, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[28:29], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v30, v[26:29], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -6931,8 +6816,7 @@ define void @v_shuffle_v2i64_v8i64__0_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v14
-; GFX942-NEXT:    v_mov_b32_e32 v3, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -6976,8 +6860,7 @@ define void @v_shuffle_v2i64_v8i64__1_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v14
-; GFX942-NEXT:    v_mov_b32_e32 v5, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -7021,8 +6904,7 @@ define void @v_shuffle_v2i64_v8i64__2_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v14
-; GFX942-NEXT:    v_mov_b32_e32 v7, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -7066,8 +6948,7 @@ define void @v_shuffle_v2i64_v8i64__3_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v14
-; GFX942-NEXT:    v_mov_b32_e32 v9, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -7111,8 +6992,7 @@ define void @v_shuffle_v2i64_v8i64__4_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v14
-; GFX942-NEXT:    v_mov_b32_e32 v11, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -7156,8 +7036,7 @@ define void @v_shuffle_v2i64_v8i64__5_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v14
-; GFX942-NEXT:    v_mov_b32_e32 v13, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -7240,8 +7119,7 @@ define void @v_shuffle_v2i64_v8i64__7_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v14
-; GFX942-NEXT:    v_mov_b32_e32 v13, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -7334,8 +7212,7 @@ define void @v_shuffle_v2i64_v8i64__9_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v20, v14
-; GFX942-NEXT:    v_mov_b32_e32 v21, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[20:21], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v32, v[18:21], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -7390,8 +7267,7 @@ define void @v_shuffle_v2i64_v8i64__10_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v22, v14
-; GFX942-NEXT:    v_mov_b32_e32 v23, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[22:23], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -7446,8 +7322,7 @@ define void @v_shuffle_v2i64_v8i64__11_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v24, v14
-; GFX942-NEXT:    v_mov_b32_e32 v25, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[24:25], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v32, v[22:25], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -7502,8 +7377,7 @@ define void @v_shuffle_v2i64_v8i64__12_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v26, v14
-; GFX942-NEXT:    v_mov_b32_e32 v27, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[26:27], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -7558,8 +7432,7 @@ define void @v_shuffle_v2i64_v8i64__13_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v28, v14
-; GFX942-NEXT:    v_mov_b32_e32 v29, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[28:29], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v32, v[26:29], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -7614,8 +7487,7 @@ define void @v_shuffle_v2i64_v8i64__14_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v30, v14
-; GFX942-NEXT:    v_mov_b32_e32 v31, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[30:31], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -7944,8 +7816,7 @@ define void @v_shuffle_v2i64_v8i64__7_8(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v14
-; GFX942-NEXT:    v_mov_b32_e32 v1, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -8000,8 +7871,7 @@ define void @v_shuffle_v2i64_v8i64__9_8(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -8046,8 +7916,7 @@ define void @v_shuffle_v2i64_v8i64__10_8(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-NEXT:    v_mov_b32_e32 v7, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -8092,8 +7961,7 @@ define void @v_shuffle_v2i64_v8i64__11_8(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v0
-; GFX942-NEXT:    v_mov_b32_e32 v9, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -8138,8 +8006,7 @@ define void @v_shuffle_v2i64_v8i64__12_8(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v0
-; GFX942-NEXT:    v_mov_b32_e32 v11, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -8184,8 +8051,7 @@ define void @v_shuffle_v2i64_v8i64__13_8(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v0
-; GFX942-NEXT:    v_mov_b32_e32 v13, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -8230,8 +8096,7 @@ define void @v_shuffle_v2i64_v8i64__14_8(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v0
-; GFX942-NEXT:    v_mov_b32_e32 v15, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -8326,8 +8191,7 @@ define void @v_shuffle_v2i64_v8i64__0_9(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:17]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -8382,8 +8246,7 @@ define void @v_shuffle_v2i64_v8i64__1_9(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v6
-; GFX942-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v20, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -8438,8 +8301,7 @@ define void @v_shuffle_v2i64_v8i64__2_9(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:21]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v8
-; GFX942-NEXT:    v_mov_b32_e32 v7, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v22, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -8494,8 +8356,7 @@ define void @v_shuffle_v2i64_v8i64__3_9(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v10
-; GFX942-NEXT:    v_mov_b32_e32 v9, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v24, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -8550,8 +8411,7 @@ define void @v_shuffle_v2i64_v8i64__4_9(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[10:25]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v12
-; GFX942-NEXT:    v_mov_b32_e32 v11, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v26, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -8606,8 +8466,7 @@ define void @v_shuffle_v2i64_v8i64__5_9(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v14
-; GFX942-NEXT:    v_mov_b32_e32 v13, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v28, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -8662,8 +8521,7 @@ define void @v_shuffle_v2i64_v8i64__6_9(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[14:29]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v16
-; GFX942-NEXT:    v_mov_b32_e32 v15, v17
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[16:17]
 ; GFX942-NEXT:    global_store_dwordx4 v30, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -8718,8 +8576,7 @@ define void @v_shuffle_v2i64_v8i64__7_9(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v16
-; GFX942-NEXT:    v_mov_b32_e32 v1, v17
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[16:17]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -8804,8 +8661,7 @@ define void @v_shuffle_v2i64_v8i64__9_9(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v2
-; GFX942-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -8850,8 +8706,7 @@ define void @v_shuffle_v2i64_v8i64__10_9(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v2
-; GFX942-NEXT:    v_mov_b32_e32 v7, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -8896,8 +8751,7 @@ define void @v_shuffle_v2i64_v8i64__11_9(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v2
-; GFX942-NEXT:    v_mov_b32_e32 v9, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -8942,8 +8796,7 @@ define void @v_shuffle_v2i64_v8i64__12_9(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v2
-; GFX942-NEXT:    v_mov_b32_e32 v11, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -8988,8 +8841,7 @@ define void @v_shuffle_v2i64_v8i64__13_9(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v2
-; GFX942-NEXT:    v_mov_b32_e32 v13, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -9034,8 +8886,7 @@ define void @v_shuffle_v2i64_v8i64__14_9(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v2
-; GFX942-NEXT:    v_mov_b32_e32 v15, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -9130,8 +8981,7 @@ define void @v_shuffle_v2i64_v8i64__0_10(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:17]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -9186,8 +9036,7 @@ define void @v_shuffle_v2i64_v8i64__1_10(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v8
-; GFX942-NEXT:    v_mov_b32_e32 v5, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v20, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -9242,8 +9091,7 @@ define void @v_shuffle_v2i64_v8i64__2_10(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:21]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v10
-; GFX942-NEXT:    v_mov_b32_e32 v7, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v22, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -9298,8 +9146,7 @@ define void @v_shuffle_v2i64_v8i64__3_10(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v12
-; GFX942-NEXT:    v_mov_b32_e32 v9, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v24, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -9354,8 +9201,7 @@ define void @v_shuffle_v2i64_v8i64__4_10(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[10:25]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v14
-; GFX942-NEXT:    v_mov_b32_e32 v11, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v26, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -9410,8 +9256,7 @@ define void @v_shuffle_v2i64_v8i64__5_10(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v16
-; GFX942-NEXT:    v_mov_b32_e32 v13, v17
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[16:17]
 ; GFX942-NEXT:    global_store_dwordx4 v28, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -9466,8 +9311,7 @@ define void @v_shuffle_v2i64_v8i64__6_10(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[14:29]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v18
-; GFX942-NEXT:    v_mov_b32_e32 v15, v19
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[18:19]
 ; GFX942-NEXT:    global_store_dwordx4 v30, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -9522,8 +9366,7 @@ define void @v_shuffle_v2i64_v8i64__7_10(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v16
-; GFX942-NEXT:    v_mov_b32_e32 v3, v17
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[16:17]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -9568,8 +9411,7 @@ define void @v_shuffle_v2i64_v8i64__8_10(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -9654,8 +9496,7 @@ define void @v_shuffle_v2i64_v8i64__10_10(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v4
-; GFX942-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -9700,8 +9541,7 @@ define void @v_shuffle_v2i64_v8i64__11_10(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v4
-; GFX942-NEXT:    v_mov_b32_e32 v9, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -9746,8 +9586,7 @@ define void @v_shuffle_v2i64_v8i64__12_10(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v4
-; GFX942-NEXT:    v_mov_b32_e32 v11, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -9792,8 +9631,7 @@ define void @v_shuffle_v2i64_v8i64__13_10(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v4
-; GFX942-NEXT:    v_mov_b32_e32 v13, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -9838,8 +9676,7 @@ define void @v_shuffle_v2i64_v8i64__14_10(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v4
-; GFX942-NEXT:    v_mov_b32_e32 v15, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -9934,8 +9771,7 @@ define void @v_shuffle_v2i64_v8i64__0_11(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:17]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v8
-; GFX942-NEXT:    v_mov_b32_e32 v3, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -9990,8 +9826,7 @@ define void @v_shuffle_v2i64_v8i64__1_11(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v10
-; GFX942-NEXT:    v_mov_b32_e32 v5, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v20, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -10046,8 +9881,7 @@ define void @v_shuffle_v2i64_v8i64__2_11(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:21]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v12
-; GFX942-NEXT:    v_mov_b32_e32 v7, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v22, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -10102,8 +9936,7 @@ define void @v_shuffle_v2i64_v8i64__3_11(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v14
-; GFX942-NEXT:    v_mov_b32_e32 v9, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v24, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -10158,8 +9991,7 @@ define void @v_shuffle_v2i64_v8i64__4_11(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[10:25]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v16
-; GFX942-NEXT:    v_mov_b32_e32 v11, v17
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[16:17]
 ; GFX942-NEXT:    global_store_dwordx4 v26, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -10214,8 +10046,7 @@ define void @v_shuffle_v2i64_v8i64__5_11(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v18
-; GFX942-NEXT:    v_mov_b32_e32 v13, v19
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[18:19]
 ; GFX942-NEXT:    global_store_dwordx4 v28, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -10270,8 +10101,7 @@ define void @v_shuffle_v2i64_v8i64__6_11(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[14:29]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v20
-; GFX942-NEXT:    v_mov_b32_e32 v15, v21
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[20:21]
 ; GFX942-NEXT:    global_store_dwordx4 v30, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -10326,8 +10156,7 @@ define void @v_shuffle_v2i64_v8i64__7_11(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v16
-; GFX942-NEXT:    v_mov_b32_e32 v5, v17
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -10372,8 +10201,7 @@ define void @v_shuffle_v2i64_v8i64__8_11(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -10418,8 +10246,7 @@ define void @v_shuffle_v2i64_v8i64__9_11(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v6
-; GFX942-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -10504,8 +10331,7 @@ define void @v_shuffle_v2i64_v8i64__11_11(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v6
-; GFX942-NEXT:    v_mov_b32_e32 v9, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -10550,8 +10376,7 @@ define void @v_shuffle_v2i64_v8i64__12_11(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v6
-; GFX942-NEXT:    v_mov_b32_e32 v11, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -10596,8 +10421,7 @@ define void @v_shuffle_v2i64_v8i64__13_11(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v6
-; GFX942-NEXT:    v_mov_b32_e32 v13, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -10642,8 +10466,7 @@ define void @v_shuffle_v2i64_v8i64__14_11(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v6
-; GFX942-NEXT:    v_mov_b32_e32 v15, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -10738,8 +10561,7 @@ define void @v_shuffle_v2i64_v8i64__0_12(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:17]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v10
-; GFX942-NEXT:    v_mov_b32_e32 v3, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -10794,8 +10616,7 @@ define void @v_shuffle_v2i64_v8i64__1_12(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v12
-; GFX942-NEXT:    v_mov_b32_e32 v5, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v20, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -10850,8 +10671,7 @@ define void @v_shuffle_v2i64_v8i64__2_12(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:21]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v14
-; GFX942-NEXT:    v_mov_b32_e32 v7, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v22, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -10906,8 +10726,7 @@ define void @v_shuffle_v2i64_v8i64__3_12(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v16
-; GFX942-NEXT:    v_mov_b32_e32 v9, v17
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[16:17]
 ; GFX942-NEXT:    global_store_dwordx4 v24, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -10962,8 +10781,7 @@ define void @v_shuffle_v2i64_v8i64__4_12(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[10:25]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v18
-; GFX942-NEXT:    v_mov_b32_e32 v11, v19
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[18:19]
 ; GFX942-NEXT:    global_store_dwordx4 v26, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -11018,8 +10836,7 @@ define void @v_shuffle_v2i64_v8i64__5_12(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v20
-; GFX942-NEXT:    v_mov_b32_e32 v13, v21
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[20:21]
 ; GFX942-NEXT:    global_store_dwordx4 v28, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -11074,8 +10891,7 @@ define void @v_shuffle_v2i64_v8i64__6_12(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[14:29]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v22
-; GFX942-NEXT:    v_mov_b32_e32 v15, v23
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[22:23]
 ; GFX942-NEXT:    global_store_dwordx4 v30, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -11130,8 +10946,7 @@ define void @v_shuffle_v2i64_v8i64__7_12(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v16
-; GFX942-NEXT:    v_mov_b32_e32 v7, v17
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[16:17]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -11176,8 +10991,7 @@ define void @v_shuffle_v2i64_v8i64__8_12(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v8
-; GFX942-NEXT:    v_mov_b32_e32 v3, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -11222,8 +11036,7 @@ define void @v_shuffle_v2i64_v8i64__9_12(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v8
-; GFX942-NEXT:    v_mov_b32_e32 v5, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -11268,8 +11081,7 @@ define void @v_shuffle_v2i64_v8i64__10_12(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v8
-; GFX942-NEXT:    v_mov_b32_e32 v7, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -11354,8 +11166,7 @@ define void @v_shuffle_v2i64_v8i64__12_12(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v8
-; GFX942-NEXT:    v_mov_b32_e32 v11, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -11400,8 +11211,7 @@ define void @v_shuffle_v2i64_v8i64__13_12(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v8
-; GFX942-NEXT:    v_mov_b32_e32 v13, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -11446,8 +11256,7 @@ define void @v_shuffle_v2i64_v8i64__14_12(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v8
-; GFX942-NEXT:    v_mov_b32_e32 v15, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -11542,8 +11351,7 @@ define void @v_shuffle_v2i64_v8i64__0_13(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:17]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v12
-; GFX942-NEXT:    v_mov_b32_e32 v3, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -11598,8 +11406,7 @@ define void @v_shuffle_v2i64_v8i64__1_13(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v14
-; GFX942-NEXT:    v_mov_b32_e32 v5, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v20, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -11654,8 +11461,7 @@ define void @v_shuffle_v2i64_v8i64__2_13(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:21]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v16
-; GFX942-NEXT:    v_mov_b32_e32 v7, v17
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[16:17]
 ; GFX942-NEXT:    global_store_dwordx4 v22, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -11710,8 +11516,7 @@ define void @v_shuffle_v2i64_v8i64__3_13(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v18
-; GFX942-NEXT:    v_mov_b32_e32 v9, v19
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[18:19]
 ; GFX942-NEXT:    global_store_dwordx4 v24, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -11766,8 +11571,7 @@ define void @v_shuffle_v2i64_v8i64__4_13(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[10:25]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v20
-; GFX942-NEXT:    v_mov_b32_e32 v11, v21
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[20:21]
 ; GFX942-NEXT:    global_store_dwordx4 v26, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -11822,8 +11626,7 @@ define void @v_shuffle_v2i64_v8i64__5_13(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v22
-; GFX942-NEXT:    v_mov_b32_e32 v13, v23
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[22:23]
 ; GFX942-NEXT:    global_store_dwordx4 v28, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -11878,8 +11681,7 @@ define void @v_shuffle_v2i64_v8i64__6_13(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[14:29]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v24
-; GFX942-NEXT:    v_mov_b32_e32 v15, v25
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[24:25]
 ; GFX942-NEXT:    global_store_dwordx4 v30, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -11934,8 +11736,7 @@ define void @v_shuffle_v2i64_v8i64__7_13(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v16
-; GFX942-NEXT:    v_mov_b32_e32 v9, v17
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[16:17]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -11980,8 +11781,7 @@ define void @v_shuffle_v2i64_v8i64__8_13(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v10
-; GFX942-NEXT:    v_mov_b32_e32 v3, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -12026,8 +11826,7 @@ define void @v_shuffle_v2i64_v8i64__9_13(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v10
-; GFX942-NEXT:    v_mov_b32_e32 v5, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -12072,8 +11871,7 @@ define void @v_shuffle_v2i64_v8i64__10_13(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v10
-; GFX942-NEXT:    v_mov_b32_e32 v7, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -12118,8 +11916,7 @@ define void @v_shuffle_v2i64_v8i64__11_13(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v10
-; GFX942-NEXT:    v_mov_b32_e32 v9, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -12204,8 +12001,7 @@ define void @v_shuffle_v2i64_v8i64__13_13(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v10
-; GFX942-NEXT:    v_mov_b32_e32 v13, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -12250,8 +12046,7 @@ define void @v_shuffle_v2i64_v8i64__14_13(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v10
-; GFX942-NEXT:    v_mov_b32_e32 v15, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -12346,8 +12141,7 @@ define void @v_shuffle_v2i64_v8i64__0_14(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:17]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v14
-; GFX942-NEXT:    v_mov_b32_e32 v3, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -12402,8 +12196,7 @@ define void @v_shuffle_v2i64_v8i64__1_14(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v16
-; GFX942-NEXT:    v_mov_b32_e32 v5, v17
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
 ; GFX942-NEXT:    global_store_dwordx4 v20, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -12458,8 +12251,7 @@ define void @v_shuffle_v2i64_v8i64__2_14(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:21]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v18
-; GFX942-NEXT:    v_mov_b32_e32 v7, v19
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
 ; GFX942-NEXT:    global_store_dwordx4 v22, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -12514,8 +12306,7 @@ define void @v_shuffle_v2i64_v8i64__3_14(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v20
-; GFX942-NEXT:    v_mov_b32_e32 v9, v21
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
 ; GFX942-NEXT:    global_store_dwordx4 v24, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -12570,8 +12361,7 @@ define void @v_shuffle_v2i64_v8i64__4_14(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[10:25]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v22
-; GFX942-NEXT:    v_mov_b32_e32 v11, v23
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
 ; GFX942-NEXT:    global_store_dwordx4 v26, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -12626,8 +12416,7 @@ define void @v_shuffle_v2i64_v8i64__5_14(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v24
-; GFX942-NEXT:    v_mov_b32_e32 v13, v25
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
 ; GFX942-NEXT:    global_store_dwordx4 v28, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -12682,8 +12471,7 @@ define void @v_shuffle_v2i64_v8i64__6_14(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[14:29]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v26
-; GFX942-NEXT:    v_mov_b32_e32 v15, v27
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
 ; GFX942-NEXT:    global_store_dwordx4 v30, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -12738,8 +12526,7 @@ define void @v_shuffle_v2i64_v8i64__7_14(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v16
-; GFX942-NEXT:    v_mov_b32_e32 v11, v17
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[16:17]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -12784,8 +12571,7 @@ define void @v_shuffle_v2i64_v8i64__8_14(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v12
-; GFX942-NEXT:    v_mov_b32_e32 v3, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -12830,8 +12616,7 @@ define void @v_shuffle_v2i64_v8i64__9_14(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v12
-; GFX942-NEXT:    v_mov_b32_e32 v5, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -12876,8 +12661,7 @@ define void @v_shuffle_v2i64_v8i64__10_14(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v12
-; GFX942-NEXT:    v_mov_b32_e32 v7, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -12922,8 +12706,7 @@ define void @v_shuffle_v2i64_v8i64__11_14(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v12
-; GFX942-NEXT:    v_mov_b32_e32 v9, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -12968,8 +12751,7 @@ define void @v_shuffle_v2i64_v8i64__12_14(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v12
-; GFX942-NEXT:    v_mov_b32_e32 v11, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -13054,8 +12836,7 @@ define void @v_shuffle_v2i64_v8i64__14_14(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v12
-; GFX942-NEXT:    v_mov_b32_e32 v15, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -13150,8 +12931,7 @@ define void @v_shuffle_v2i64_v8i64__0_15(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:17]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v16
-; GFX942-NEXT:    v_mov_b32_e32 v3, v17
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[16:17]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -13206,8 +12986,7 @@ define void @v_shuffle_v2i64_v8i64__1_15(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v18
-; GFX942-NEXT:    v_mov_b32_e32 v5, v19
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[18:19]
 ; GFX942-NEXT:    global_store_dwordx4 v20, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -13262,8 +13041,7 @@ define void @v_shuffle_v2i64_v8i64__2_15(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:21]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v20
-; GFX942-NEXT:    v_mov_b32_e32 v7, v21
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[20:21]
 ; GFX942-NEXT:    global_store_dwordx4 v22, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -13318,8 +13096,7 @@ define void @v_shuffle_v2i64_v8i64__3_15(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v22
-; GFX942-NEXT:    v_mov_b32_e32 v9, v23
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[22:23]
 ; GFX942-NEXT:    global_store_dwordx4 v24, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -13374,8 +13151,7 @@ define void @v_shuffle_v2i64_v8i64__4_15(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[10:25]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v24
-; GFX942-NEXT:    v_mov_b32_e32 v11, v25
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[24:25]
 ; GFX942-NEXT:    global_store_dwordx4 v26, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -13430,8 +13206,7 @@ define void @v_shuffle_v2i64_v8i64__5_15(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v26
-; GFX942-NEXT:    v_mov_b32_e32 v13, v27
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[26:27]
 ; GFX942-NEXT:    global_store_dwordx4 v28, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -13486,8 +13261,7 @@ define void @v_shuffle_v2i64_v8i64__6_15(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[14:29]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v28
-; GFX942-NEXT:    v_mov_b32_e32 v15, v29
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[28:29]
 ; GFX942-NEXT:    global_store_dwordx4 v30, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -13542,8 +13316,7 @@ define void @v_shuffle_v2i64_v8i64__7_15(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v16
-; GFX942-NEXT:    v_mov_b32_e32 v13, v17
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[16:17]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -13588,8 +13361,7 @@ define void @v_shuffle_v2i64_v8i64__8_15(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v14
-; GFX942-NEXT:    v_mov_b32_e32 v3, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -13634,8 +13406,7 @@ define void @v_shuffle_v2i64_v8i64__9_15(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v14
-; GFX942-NEXT:    v_mov_b32_e32 v5, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -13680,8 +13451,7 @@ define void @v_shuffle_v2i64_v8i64__10_15(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v14
-; GFX942-NEXT:    v_mov_b32_e32 v7, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -13726,8 +13496,7 @@ define void @v_shuffle_v2i64_v8i64__11_15(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v14
-; GFX942-NEXT:    v_mov_b32_e32 v9, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -13772,8 +13541,7 @@ define void @v_shuffle_v2i64_v8i64__12_15(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v14
-; GFX942-NEXT:    v_mov_b32_e32 v11, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -13818,8 +13586,7 @@ define void @v_shuffle_v2i64_v8i64__13_15(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v14
-; GFX942-NEXT:    v_mov_b32_e32 v13, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -13957,8 +13724,7 @@ define void @s_shuffle_v2i64_v8i64__1_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14042,8 +13808,7 @@ define void @s_shuffle_v2i64_v8i64__3_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14131,8 +13896,7 @@ define void @s_shuffle_v2i64_v8i64__5_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14221,8 +13985,7 @@ define void @s_shuffle_v2i64_v8i64__7_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14280,8 +14043,7 @@ define void @s_shuffle_v2i64_v8i64__9_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14367,8 +14129,7 @@ define void @s_shuffle_v2i64_v8i64__11_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14458,8 +14219,7 @@ define void @s_shuffle_v2i64_v8i64__13_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14550,8 +14310,7 @@ define void @s_shuffle_v2i64_v8i64__15_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14610,10 +14369,8 @@ define void @s_shuffle_v2i64_v8i64__15_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14668,8 +14425,7 @@ define void @s_shuffle_v2i64_v8i64__15_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s26
-; GFX942-NEXT:    s_mov_b32 s9, s27
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[26:27]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14750,10 +14506,8 @@ define void @s_shuffle_v2i64_v8i64__15_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s22
-; GFX942-NEXT:    s_mov_b32 s9, s23
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[22:23]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14808,8 +14562,7 @@ define void @s_shuffle_v2i64_v8i64__15_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s26
-; GFX942-NEXT:    s_mov_b32 s9, s27
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[26:27]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14894,10 +14647,8 @@ define void @s_shuffle_v2i64_v8i64__15_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s30
-; GFX942-NEXT:    s_mov_b32 s9, s31
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[30:31]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14984,8 +14735,7 @@ define void @s_shuffle_v2i64_v8i64__15_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s26
-; GFX942-NEXT:    s_mov_b32 s9, s27
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[26:27]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15094,10 +14844,8 @@ define void @s_shuffle_v2i64_v8i64__15_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s30
-; GFX942-NEXT:    s_mov_b32 s9, s31
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[30:31]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15212,8 +14960,7 @@ define void @s_shuffle_v2i64_v8i64__15_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s30
-; GFX942-NEXT:    s_mov_b32 s13, s31
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[30:31]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -15270,10 +15017,8 @@ define void @s_shuffle_v2i64_v8i64__15_8() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15286,18 +15031,43 @@ define void @s_shuffle_v2i64_v8i64__15_8() {
 }
 
 define void @s_shuffle_v2i64_v8i64__15_9() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__15_9:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s22
-; GFX9-NEXT:    s_mov_b32 s9, s23
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__15_9:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__15_9:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__15_9:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[22:23]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 15, i32 9>
@@ -15342,10 +15112,8 @@ define void @s_shuffle_v2i64_v8i64__15_10() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15358,18 +15126,43 @@ define void @s_shuffle_v2i64_v8i64__15_10() {
 }
 
 define void @s_shuffle_v2i64_v8i64__15_11() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__15_11:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s18
-; GFX9-NEXT:    s_mov_b32 s9, s19
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__15_11:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__15_11:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__15_11:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 15, i32 11>
@@ -15378,20 +15171,48 @@ define void @s_shuffle_v2i64_v8i64__15_11() {
 }
 
 define void @s_shuffle_v2i64_v8i64__15_12() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__15_12:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s18
-; GFX9-NEXT:    s_mov_b32 s9, s19
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__15_12:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__15_12:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__15_12:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 15, i32 12>
@@ -15436,8 +15257,7 @@ define void @s_shuffle_v2i64_v8i64__15_13() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15486,10 +15306,8 @@ define void @s_shuffle_v2i64_v8i64__15_14() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15538,8 +15356,7 @@ define void @s_shuffle_v2i64_v8i64__15_15() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -15586,8 +15403,7 @@ define void @s_shuffle_v2i64_v8i64__u_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15599,18 +15415,43 @@ define void @s_shuffle_v2i64_v8i64__u_0() {
 }
 
 define void @s_shuffle_v2i64_v8i64__0_0() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> zeroinitializer
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -15654,10 +15495,8 @@ define void @s_shuffle_v2i64_v8i64__1_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15669,18 +15508,43 @@ define void @s_shuffle_v2i64_v8i64__1_0() {
 }
 
 define void @s_shuffle_v2i64_v8i64__2_0() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__2_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s4
-; GFX9-NEXT:    s_mov_b32 s11, s5
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__2_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__2_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 2, i32 0>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -15724,10 +15588,8 @@ define void @s_shuffle_v2i64_v8i64__3_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15775,8 +15637,7 @@ define void @s_shuffle_v2i64_v8i64__4_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15824,10 +15685,8 @@ define void @s_shuffle_v2i64_v8i64__5_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15875,8 +15734,7 @@ define void @s_shuffle_v2i64_v8i64__6_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -15926,10 +15784,8 @@ define void @s_shuffle_v2i64_v8i64__7_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15973,8 +15829,7 @@ define void @s_shuffle_v2i64_v8i64__8_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16032,10 +15887,8 @@ define void @s_shuffle_v2i64_v8i64__9_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16136,8 +15989,7 @@ define void @s_shuffle_v2i64_v8i64__10_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16196,10 +16048,8 @@ define void @s_shuffle_v2i64_v8i64__11_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16262,8 +16112,7 @@ define void @s_shuffle_v2i64_v8i64__12_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16328,10 +16177,8 @@ define void @s_shuffle_v2i64_v8i64__13_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16390,8 +16237,7 @@ define void @s_shuffle_v2i64_v8i64__14_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s18, s0
-; GFX942-NEXT:    s_mov_b32 s19, s1
+; GFX942-NEXT:    s_mov_b64 s[18:19], s[0:1]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -16486,18 +16332,43 @@ define void @s_shuffle_v2i64_v8i64__0_1() {
 }
 
 define void @s_shuffle_v2i64_v8i64__1_1() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -16505,18 +16376,43 @@ define void @s_shuffle_v2i64_v8i64__1_1() {
 }
 
 define void @s_shuffle_v2i64_v8i64__2_1() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__2_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s6
-; GFX9-NEXT:    s_mov_b32 s11, s7
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s6
+; GFX900-NEXT:    s_mov_b32 s11, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__2_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s6
+; GFX90A-NEXT:    s_mov_b32 s11, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__2_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 2, i32 1>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -16524,18 +16420,43 @@ define void @s_shuffle_v2i64_v8i64__2_1() {
 }
 
 define void @s_shuffle_v2i64_v8i64__3_1() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__3_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__3_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__3_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__3_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 3, i32 1>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -16579,8 +16500,7 @@ define void @s_shuffle_v2i64_v8i64__4_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16592,18 +16512,43 @@ define void @s_shuffle_v2i64_v8i64__4_1() {
 }
 
 define void @s_shuffle_v2i64_v8i64__5_1() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__5_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s18
-; GFX9-NEXT:    s_mov_b32 s9, s19
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__5_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__5_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__5_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 5, i32 1>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -16647,8 +16592,7 @@ define void @s_shuffle_v2i64_v8i64__6_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -16662,18 +16606,43 @@ define void @s_shuffle_v2i64_v8i64__6_1() {
 }
 
 define void @s_shuffle_v2i64_v8i64__7_1() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__7_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s22
-; GFX9-NEXT:    s_mov_b32 s9, s23
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__7_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__7_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__7_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[22:23]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 7, i32 1>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -16763,8 +16732,7 @@ define void @s_shuffle_v2i64_v8i64__9_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16865,8 +16833,7 @@ define void @s_shuffle_v2i64_v8i64__10_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16921,8 +16888,7 @@ define void @s_shuffle_v2i64_v8i64__11_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16985,8 +16951,7 @@ define void @s_shuffle_v2i64_v8i64__12_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s18
-; GFX942-NEXT:    s_mov_b32 s11, s19
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17047,8 +17012,7 @@ define void @s_shuffle_v2i64_v8i64__13_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s22
-; GFX942-NEXT:    s_mov_b32 s9, s23
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[22:23]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17107,8 +17071,7 @@ define void @s_shuffle_v2i64_v8i64__14_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s18, s2
-; GFX942-NEXT:    s_mov_b32 s19, s3
+; GFX942-NEXT:    s_mov_b64 s[18:19], s[2:3]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -17155,8 +17118,7 @@ define void @s_shuffle_v2i64_v8i64__u_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17168,18 +17130,43 @@ define void @s_shuffle_v2i64_v8i64__u_2() {
 }
 
 define void @s_shuffle_v2i64_v8i64__0_2() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__0_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__0_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__0_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 0, i32 2>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -17223,10 +17210,8 @@ define void @s_shuffle_v2i64_v8i64__1_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17238,18 +17223,43 @@ define void @s_shuffle_v2i64_v8i64__1_2() {
 }
 
 define void @s_shuffle_v2i64_v8i64__2_2() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -17293,10 +17303,8 @@ define void @s_shuffle_v2i64_v8i64__3_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17344,8 +17352,7 @@ define void @s_shuffle_v2i64_v8i64__4_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17393,10 +17400,8 @@ define void @s_shuffle_v2i64_v8i64__5_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17444,8 +17449,7 @@ define void @s_shuffle_v2i64_v8i64__6_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -17495,10 +17499,8 @@ define void @s_shuffle_v2i64_v8i64__7_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17542,8 +17544,7 @@ define void @s_shuffle_v2i64_v8i64__8_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17623,10 +17624,8 @@ define void @s_shuffle_v2i64_v8i64__9_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17708,8 +17707,7 @@ define void @s_shuffle_v2i64_v8i64__10_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s20
-; GFX942-NEXT:    s_mov_b32 s11, s21
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[20:21]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17796,10 +17794,8 @@ define void @s_shuffle_v2i64_v8i64__11_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17858,8 +17854,7 @@ define void @s_shuffle_v2i64_v8i64__12_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17940,10 +17935,8 @@ define void @s_shuffle_v2i64_v8i64__13_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -18002,8 +17995,7 @@ define void @s_shuffle_v2i64_v8i64__14_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s22, s4
-; GFX942-NEXT:    s_mov_b32 s23, s5
+; GFX942-NEXT:    s_mov_b64 s[22:23], s[4:5]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[20:21]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[22:23]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -18058,37 +18050,87 @@ define void @s_shuffle_v2i64_v8i64__u_3() {
 }
 
 define void @s_shuffle_v2i64_v8i64__0_3() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__0_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <8 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 0, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v2i64_v8i64__1_3() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__1_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__0_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__0_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <8 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 0, i32 3>
+  call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v2i64_v8i64__1_3() {
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__1_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__1_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__1_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 1, i32 3>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -18136,18 +18178,43 @@ define void @s_shuffle_v2i64_v8i64__2_3() {
 }
 
 define void @s_shuffle_v2i64_v8i64__3_3() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -18191,8 +18258,7 @@ define void @s_shuffle_v2i64_v8i64__4_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -18204,18 +18270,43 @@ define void @s_shuffle_v2i64_v8i64__4_3() {
 }
 
 define void @s_shuffle_v2i64_v8i64__5_3() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__5_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__5_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__5_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__5_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 5, i32 3>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -18259,8 +18350,7 @@ define void @s_shuffle_v2i64_v8i64__6_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -18274,18 +18364,43 @@ define void @s_shuffle_v2i64_v8i64__6_3() {
 }
 
 define void @s_shuffle_v2i64_v8i64__7_3() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__7_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s18
-; GFX9-NEXT:    s_mov_b32 s9, s19
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__7_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__7_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__7_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 7, i32 3>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -18375,8 +18490,7 @@ define void @s_shuffle_v2i64_v8i64__9_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -18458,8 +18572,7 @@ define void @s_shuffle_v2i64_v8i64__10_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s22
-; GFX942-NEXT:    s_mov_b32 s11, s23
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[22:23]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -18520,8 +18633,7 @@ define void @s_shuffle_v2i64_v8i64__11_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -18580,8 +18692,7 @@ define void @s_shuffle_v2i64_v8i64__12_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s18
-; GFX942-NEXT:    s_mov_b32 s11, s19
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -18636,8 +18747,7 @@ define void @s_shuffle_v2i64_v8i64__13_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s22
-; GFX942-NEXT:    s_mov_b32 s9, s23
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[22:23]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -18696,8 +18806,7 @@ define void @s_shuffle_v2i64_v8i64__14_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s22, s6
-; GFX942-NEXT:    s_mov_b32 s23, s7
+; GFX942-NEXT:    s_mov_b64 s[22:23], s[6:7]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[20:21]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[22:23]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -18744,8 +18853,7 @@ define void @s_shuffle_v2i64_v8i64__u_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s8
-; GFX942-NEXT:    s_mov_b32 s11, s9
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -18757,18 +18865,43 @@ define void @s_shuffle_v2i64_v8i64__u_4() {
 }
 
 define void @s_shuffle_v2i64_v8i64__0_4() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__0_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s16
-; GFX9-NEXT:    s_mov_b32 s11, s17
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s16
+; GFX900-NEXT:    s_mov_b32 s11, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__0_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__0_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 0, i32 4>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -18776,20 +18909,48 @@ define void @s_shuffle_v2i64_v8i64__0_4() {
 }
 
 define void @s_shuffle_v2i64_v8i64__1_4() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__1_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__1_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__1_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__1_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 1, i32 4>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -18797,18 +18958,43 @@ define void @s_shuffle_v2i64_v8i64__1_4() {
 }
 
 define void @s_shuffle_v2i64_v8i64__2_4() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__2_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__2_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__2_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 2, i32 4>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -18816,90 +19002,145 @@ define void @s_shuffle_v2i64_v8i64__2_4() {
 }
 
 define void @s_shuffle_v2i64_v8i64__3_4() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__3_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__3_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__3_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__3_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <8 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 3, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v2i64_v8i64__4_4() {
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 3, i32 4>
+  %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__4_4() {
-; GFX900-LABEL: s_shuffle_v2i64_v8i64__4_4:
+define void @s_shuffle_v2i64_v8i64__5_4() {
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__5_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s14, s12
-; GFX900-NEXT:    s_mov_b32 s15, s13
-; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
-; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v2i64_v8i64__4_4:
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__5_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s14, s12
-; GFX90A-NEXT:    s_mov_b32 s15, s13
-; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
-; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v2i64_v8i64__4_4:
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__5_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:15]
+; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s8
-; GFX942-NEXT:    s_mov_b32 s11, s9
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <8 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 4, i32 4>
-  call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v2i64_v8i64__5_4() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__5_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 5, i32 4>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -18943,8 +19184,7 @@ define void @s_shuffle_v2i64_v8i64__6_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s8
-; GFX942-NEXT:    s_mov_b32 s15, s9
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[8:9]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -18958,20 +19198,48 @@ define void @s_shuffle_v2i64_v8i64__6_4() {
 }
 
 define void @s_shuffle_v2i64_v8i64__7_4() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__7_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s18
-; GFX9-NEXT:    s_mov_b32 s9, s19
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__7_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__7_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__7_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 7, i32 4>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -19011,8 +19279,7 @@ define void @s_shuffle_v2i64_v8i64__8_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s8
-; GFX942-NEXT:    s_mov_b32 s11, s9
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -19096,10 +19363,8 @@ define void @s_shuffle_v2i64_v8i64__9_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -19160,8 +19425,7 @@ define void @s_shuffle_v2i64_v8i64__10_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s20
-; GFX942-NEXT:    s_mov_b32 s11, s21
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[20:21]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -19246,10 +19510,8 @@ define void @s_shuffle_v2i64_v8i64__11_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s22
-; GFX942-NEXT:    s_mov_b32 s9, s23
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[22:23]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -19336,8 +19598,7 @@ define void @s_shuffle_v2i64_v8i64__12_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -19422,10 +19683,8 @@ define void @s_shuffle_v2i64_v8i64__13_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s26
-; GFX942-NEXT:    s_mov_b32 s9, s27
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[26:27]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -19512,8 +19771,7 @@ define void @s_shuffle_v2i64_v8i64__14_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s26, s8
-; GFX942-NEXT:    s_mov_b32 s27, s9
+; GFX942-NEXT:    s_mov_b64 s[26:27], s[8:9]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[24:25]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[26:27]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -19572,18 +19830,43 @@ define void @s_shuffle_v2i64_v8i64__u_5() {
 }
 
 define void @s_shuffle_v2i64_v8i64__0_5() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__0_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s18
-; GFX9-NEXT:    s_mov_b32 s11, s19
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s18
+; GFX900-NEXT:    s_mov_b32 s11, s19
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__0_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s18
+; GFX90A-NEXT:    s_mov_b32 s11, s19
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__0_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 0, i32 5>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -19627,8 +19910,7 @@ define void @s_shuffle_v2i64_v8i64__1_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -19640,18 +19922,43 @@ define void @s_shuffle_v2i64_v8i64__1_5() {
 }
 
 define void @s_shuffle_v2i64_v8i64__2_5() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__2_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__2_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__2_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 2, i32 5>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -19695,8 +20002,7 @@ define void @s_shuffle_v2i64_v8i64__3_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -19788,8 +20094,7 @@ define void @s_shuffle_v2i64_v8i64__5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -19837,8 +20142,7 @@ define void @s_shuffle_v2i64_v8i64__6_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -19888,8 +20192,7 @@ define void @s_shuffle_v2i64_v8i64__7_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -20013,8 +20316,7 @@ define void @s_shuffle_v2i64_v8i64__9_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -20069,8 +20371,7 @@ define void @s_shuffle_v2i64_v8i64__10_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s22
-; GFX942-NEXT:    s_mov_b32 s11, s23
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[22:23]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -20151,8 +20452,7 @@ define void @s_shuffle_v2i64_v8i64__11_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -20233,8 +20533,7 @@ define void @s_shuffle_v2i64_v8i64__12_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s18
-; GFX942-NEXT:    s_mov_b32 s11, s19
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -20315,8 +20614,7 @@ define void @s_shuffle_v2i64_v8i64__13_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s22
-; GFX942-NEXT:    s_mov_b32 s9, s23
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[22:23]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -20397,8 +20695,7 @@ define void @s_shuffle_v2i64_v8i64__14_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s26, s10
-; GFX942-NEXT:    s_mov_b32 s27, s11
+; GFX942-NEXT:    s_mov_b64 s[26:27], s[10:11]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[24:25]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[26:27]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -20445,8 +20742,7 @@ define void @s_shuffle_v2i64_v8i64__u_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -20458,18 +20754,43 @@ define void @s_shuffle_v2i64_v8i64__u_6() {
 }
 
 define void @s_shuffle_v2i64_v8i64__0_6() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__0_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s20
-; GFX9-NEXT:    s_mov_b32 s11, s21
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_6:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s20
+; GFX900-NEXT:    s_mov_b32 s11, s21
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__0_6:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s20
+; GFX90A-NEXT:    s_mov_b32 s11, s21
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__0_6:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[20:21]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 0, i32 6>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -20511,35 +20832,58 @@ define void @s_shuffle_v2i64_v8i64__1_6() {
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:15]
+; GFX942-NEXT:    ; def s[0:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <8 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 1, i32 6>
+  call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v2i64_v8i64__2_6() {
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_6:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s16
+; GFX900-NEXT:    s_mov_b32 s11, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__2_6:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__2_6:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <8 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 1, i32 6>
-  call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v2i64_v8i64__2_6() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__2_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s16
-; GFX9-NEXT:    s_mov_b32 s11, s17
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 2, i32 6>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -20583,10 +20927,8 @@ define void @s_shuffle_v2i64_v8i64__3_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -20634,8 +20976,7 @@ define void @s_shuffle_v2i64_v8i64__4_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -20683,10 +21024,8 @@ define void @s_shuffle_v2i64_v8i64__5_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -20734,8 +21073,7 @@ define void @s_shuffle_v2i64_v8i64__6_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -20785,10 +21123,8 @@ define void @s_shuffle_v2i64_v8i64__7_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -20832,8 +21168,7 @@ define void @s_shuffle_v2i64_v8i64__8_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -20941,10 +21276,8 @@ define void @s_shuffle_v2i64_v8i64__9_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -21005,8 +21338,7 @@ define void @s_shuffle_v2i64_v8i64__10_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s20
-; GFX942-NEXT:    s_mov_b32 s11, s21
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[20:21]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -21115,10 +21447,8 @@ define void @s_shuffle_v2i64_v8i64__11_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s22
-; GFX942-NEXT:    s_mov_b32 s9, s23
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[22:23]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -21229,8 +21559,7 @@ define void @s_shuffle_v2i64_v8i64__12_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -21339,10 +21668,8 @@ define void @s_shuffle_v2i64_v8i64__13_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s26
-; GFX942-NEXT:    s_mov_b32 s9, s27
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[26:27]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -21457,8 +21784,7 @@ define void @s_shuffle_v2i64_v8i64__14_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s30, s12
-; GFX942-NEXT:    s_mov_b32 s31, s13
+; GFX942-NEXT:    s_mov_b64 s[30:31], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[28:29]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[30:31]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -21524,18 +21850,43 @@ define void @s_shuffle_v2i64_v8i64__u_7() {
 }
 
 define void @s_shuffle_v2i64_v8i64__0_7() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__0_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s22
-; GFX9-NEXT:    s_mov_b32 s11, s23
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s22
+; GFX900-NEXT:    s_mov_b32 s11, s23
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__0_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s22
+; GFX90A-NEXT:    s_mov_b32 s11, s23
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__0_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[22:23]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 0, i32 7>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -21579,8 +21930,7 @@ define void @s_shuffle_v2i64_v8i64__1_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -21594,18 +21944,43 @@ define void @s_shuffle_v2i64_v8i64__1_7() {
 }
 
 define void @s_shuffle_v2i64_v8i64__2_7() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__2_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s18
-; GFX9-NEXT:    s_mov_b32 s11, s19
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s18
+; GFX900-NEXT:    s_mov_b32 s11, s19
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__2_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s18
+; GFX90A-NEXT:    s_mov_b32 s11, s19
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__2_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 2, i32 7>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -21649,8 +22024,7 @@ define void @s_shuffle_v2i64_v8i64__3_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -21700,8 +22074,7 @@ define void @s_shuffle_v2i64_v8i64__4_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -21749,8 +22122,7 @@ define void @s_shuffle_v2i64_v8i64__5_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -21845,8 +22217,7 @@ define void @s_shuffle_v2i64_v8i64__7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -22001,8 +22372,7 @@ define void @s_shuffle_v2i64_v8i64__9_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s18
-; GFX942-NEXT:    s_mov_b32 s13, s19
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[18:19]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -22065,8 +22435,7 @@ define void @s_shuffle_v2i64_v8i64__10_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s22
-; GFX942-NEXT:    s_mov_b32 s11, s23
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[22:23]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -22175,8 +22544,7 @@ define void @s_shuffle_v2i64_v8i64__11_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s22
-; GFX942-NEXT:    s_mov_b32 s13, s23
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[22:23]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -22289,8 +22657,7 @@ define void @s_shuffle_v2i64_v8i64__12_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s18
-; GFX942-NEXT:    s_mov_b32 s11, s19
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -22399,8 +22766,7 @@ define void @s_shuffle_v2i64_v8i64__13_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s26
-; GFX942-NEXT:    s_mov_b32 s13, s27
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[26:27]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -22517,8 +22883,7 @@ define void @s_shuffle_v2i64_v8i64__14_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s30, s14
-; GFX942-NEXT:    s_mov_b32 s31, s15
+; GFX942-NEXT:    s_mov_b64 s[30:31], s[14:15]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[28:29]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[30:31]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -22625,8 +22990,7 @@ define void @s_shuffle_v2i64_v8i64__1_8() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -22710,8 +23074,7 @@ define void @s_shuffle_v2i64_v8i64__3_8() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -22799,8 +23162,7 @@ define void @s_shuffle_v2i64_v8i64__5_8() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -22889,8 +23251,7 @@ define void @s_shuffle_v2i64_v8i64__7_8() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -22952,10 +23313,8 @@ define void @s_shuffle_v2i64_v8i64__9_8() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -22968,18 +23327,43 @@ define void @s_shuffle_v2i64_v8i64__9_8() {
 }
 
 define void @s_shuffle_v2i64_v8i64__10_8() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__10_8:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s4
-; GFX9-NEXT:    s_mov_b32 s11, s5
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_8:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__10_8:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__10_8:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 10, i32 8>
@@ -23024,10 +23408,8 @@ define void @s_shuffle_v2i64_v8i64__11_8() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23076,8 +23458,7 @@ define void @s_shuffle_v2i64_v8i64__12_8() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23126,10 +23507,8 @@ define void @s_shuffle_v2i64_v8i64__13_8() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23178,8 +23557,7 @@ define void @s_shuffle_v2i64_v8i64__14_8() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -23277,8 +23655,7 @@ define void @s_shuffle_v2i64_v8i64__0_9() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23333,8 +23710,7 @@ define void @s_shuffle_v2i64_v8i64__1_9() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23389,8 +23765,7 @@ define void @s_shuffle_v2i64_v8i64__2_9() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23491,8 +23866,7 @@ define void @s_shuffle_v2i64_v8i64__3_9() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23573,8 +23947,7 @@ define void @s_shuffle_v2i64_v8i64__4_9() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23656,8 +24029,7 @@ define void @s_shuffle_v2i64_v8i64__5_9() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s26
-; GFX942-NEXT:    s_mov_b32 s9, s27
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[26:27]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23772,8 +24144,7 @@ define void @s_shuffle_v2i64_v8i64__6_9() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s18
-; GFX942-NEXT:    s_mov_b32 s15, s19
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[18:19]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -23836,8 +24207,7 @@ define void @s_shuffle_v2i64_v8i64__7_9() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s26
-; GFX942-NEXT:    s_mov_b32 s9, s27
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[26:27]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23891,18 +24261,43 @@ define void @s_shuffle_v2i64_v8i64__8_9() {
 }
 
 define void @s_shuffle_v2i64_v8i64__9_9() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__9_9:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__9_9:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__9_9:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__9_9:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 9, i32 9>
@@ -23911,18 +24306,43 @@ define void @s_shuffle_v2i64_v8i64__9_9() {
 }
 
 define void @s_shuffle_v2i64_v8i64__10_9() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__10_9:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s6
-; GFX9-NEXT:    s_mov_b32 s11, s7
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_9:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s6
+; GFX900-NEXT:    s_mov_b32 s11, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__10_9:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s6
+; GFX90A-NEXT:    s_mov_b32 s11, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__10_9:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 10, i32 9>
@@ -23931,18 +24351,43 @@ define void @s_shuffle_v2i64_v8i64__10_9() {
 }
 
 define void @s_shuffle_v2i64_v8i64__11_9() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__11_9:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__11_9:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__11_9:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__11_9:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 11, i32 9>
@@ -23987,8 +24432,7 @@ define void @s_shuffle_v2i64_v8i64__12_9() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24001,18 +24445,43 @@ define void @s_shuffle_v2i64_v8i64__12_9() {
 }
 
 define void @s_shuffle_v2i64_v8i64__13_9() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__13_9:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s18
-; GFX9-NEXT:    s_mov_b32 s9, s19
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__13_9:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__13_9:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__13_9:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 13, i32 9>
@@ -24057,8 +24526,7 @@ define void @s_shuffle_v2i64_v8i64__14_9() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -24105,8 +24573,7 @@ define void @s_shuffle_v2i64_v8i64__u_10() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24161,8 +24628,7 @@ define void @s_shuffle_v2i64_v8i64__0_10() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24221,10 +24687,8 @@ define void @s_shuffle_v2i64_v8i64__1_10() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24279,8 +24743,7 @@ define void @s_shuffle_v2i64_v8i64__2_10() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24339,10 +24802,8 @@ define void @s_shuffle_v2i64_v8i64__3_10() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24423,8 +24884,7 @@ define void @s_shuffle_v2i64_v8i64__4_10() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24501,13 +24961,11 @@ define void @s_shuffle_v2i64_v8i64__5_10() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24616,8 +25074,7 @@ define void @s_shuffle_v2i64_v8i64__6_10() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s20
-; GFX942-NEXT:    s_mov_b32 s15, s21
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[20:21]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -24734,10 +25191,8 @@ define void @s_shuffle_v2i64_v8i64__7_10() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s20
-; GFX942-NEXT:    s_mov_b32 s11, s21
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[20:21]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24756,18 +25211,43 @@ define void @s_shuffle_v2i64_v8i64__7_10() {
 }
 
 define void @s_shuffle_v2i64_v8i64__8_10() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__8_10:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__8_10:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__8_10:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__8_10:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 8, i32 10>
@@ -24812,10 +25292,8 @@ define void @s_shuffle_v2i64_v8i64__9_10() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24828,18 +25306,43 @@ define void @s_shuffle_v2i64_v8i64__9_10() {
 }
 
 define void @s_shuffle_v2i64_v8i64__10_10() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__10_10:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_10:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__10_10:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__10_10:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 10, i32 10>
@@ -24884,10 +25387,8 @@ define void @s_shuffle_v2i64_v8i64__11_10() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24936,8 +25437,7 @@ define void @s_shuffle_v2i64_v8i64__12_10() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24986,10 +25486,8 @@ define void @s_shuffle_v2i64_v8i64__13_10() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25038,8 +25536,7 @@ define void @s_shuffle_v2i64_v8i64__14_10() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -25137,8 +25634,7 @@ define void @s_shuffle_v2i64_v8i64__0_11() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s18
-; GFX942-NEXT:    s_mov_b32 s11, s19
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25239,8 +25735,7 @@ define void @s_shuffle_v2i64_v8i64__1_11() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25295,8 +25790,7 @@ define void @s_shuffle_v2i64_v8i64__2_11() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s18
-; GFX942-NEXT:    s_mov_b32 s11, s19
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25378,8 +25872,7 @@ define void @s_shuffle_v2i64_v8i64__3_11() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s22
-; GFX942-NEXT:    s_mov_b32 s9, s23
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[22:23]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25466,8 +25959,7 @@ define void @s_shuffle_v2i64_v8i64__4_11() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s18
-; GFX942-NEXT:    s_mov_b32 s11, s19
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25522,8 +26014,7 @@ define void @s_shuffle_v2i64_v8i64__5_11() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s22
-; GFX942-NEXT:    s_mov_b32 s9, s23
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[22:23]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25632,8 +26123,7 @@ define void @s_shuffle_v2i64_v8i64__6_11() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s22
-; GFX942-NEXT:    s_mov_b32 s15, s23
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[22:23]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -25696,8 +26186,7 @@ define void @s_shuffle_v2i64_v8i64__7_11() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s22
-; GFX942-NEXT:    s_mov_b32 s9, s23
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[22:23]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25710,18 +26199,43 @@ define void @s_shuffle_v2i64_v8i64__7_11() {
 }
 
 define void @s_shuffle_v2i64_v8i64__8_11() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__8_11:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__8_11:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__8_11:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__8_11:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 8, i32 11>
@@ -25730,18 +26244,43 @@ define void @s_shuffle_v2i64_v8i64__8_11() {
 }
 
 define void @s_shuffle_v2i64_v8i64__9_11() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__9_11:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__9_11:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__9_11:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__9_11:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 9, i32 11>
@@ -25791,18 +26330,43 @@ define void @s_shuffle_v2i64_v8i64__10_11() {
 }
 
 define void @s_shuffle_v2i64_v8i64__11_11() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__11_11:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__11_11:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__11_11:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__11_11:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 11, i32 11>
@@ -25847,8 +26411,7 @@ define void @s_shuffle_v2i64_v8i64__12_11() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25861,18 +26424,43 @@ define void @s_shuffle_v2i64_v8i64__12_11() {
 }
 
 define void @s_shuffle_v2i64_v8i64__13_11() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__13_11:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__13_11:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__13_11:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__13_11:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 13, i32 11>
@@ -25917,8 +26505,7 @@ define void @s_shuffle_v2i64_v8i64__14_11() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -25965,8 +26552,7 @@ define void @s_shuffle_v2i64_v8i64__u_12() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s8
-; GFX942-NEXT:    s_mov_b32 s11, s9
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -26021,8 +26607,7 @@ define void @s_shuffle_v2i64_v8i64__0_12() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s20
-; GFX942-NEXT:    s_mov_b32 s11, s21
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[20:21]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -26081,10 +26666,8 @@ define void @s_shuffle_v2i64_v8i64__1_12() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -26139,8 +26722,7 @@ define void @s_shuffle_v2i64_v8i64__2_12() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s20
-; GFX942-NEXT:    s_mov_b32 s11, s21
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[20:21]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -26199,10 +26781,8 @@ define void @s_shuffle_v2i64_v8i64__3_12() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -26283,8 +26863,7 @@ define void @s_shuffle_v2i64_v8i64__4_12() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s20
-; GFX942-NEXT:    s_mov_b32 s11, s21
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[20:21]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -26361,13 +26940,11 @@ define void @s_shuffle_v2i64_v8i64__5_12() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s20
-; GFX942-NEXT:    s_mov_b32 s11, s21
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[20:21]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -26476,8 +27053,7 @@ define void @s_shuffle_v2i64_v8i64__6_12() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s24
-; GFX942-NEXT:    s_mov_b32 s15, s25
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[24:25]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -26539,159 +27115,263 @@ define void @s_shuffle_v2i64_v8i64__7_12() {
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v2i64_v8i64__7_12:
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__7_12:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX90A-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX90A-NEXT:    v_writelane_b32 v0, s36, 0
+; GFX90A-NEXT:    v_writelane_b32 v0, s37, 1
+; GFX90A-NEXT:    v_writelane_b32 v0, s38, 2
+; GFX90A-NEXT:    v_writelane_b32 v0, s39, 3
+; GFX90A-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX90A-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX90A-NEXT:    v_writelane_b32 v0, s50, 6
+; GFX90A-NEXT:    v_writelane_b32 v0, s51, 7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[36:51]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s10, s44
+; GFX90A-NEXT:    s_mov_b32 s11, s45
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX90A-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX90A-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX90A-NEXT:    v_readlane_b32 s48, v0, 4
+; GFX90A-NEXT:    v_readlane_b32 s39, v0, 3
+; GFX90A-NEXT:    v_readlane_b32 s38, v0, 2
+; GFX90A-NEXT:    v_readlane_b32 s37, v0, 1
+; GFX90A-NEXT:    v_readlane_b32 s36, v0, 0
+; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__7_12:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
+; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[16:31]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[24:25]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
+; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
+; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <8 x i64> asm "; def $0", "=s"()
+  %vec1 = call <8 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 7, i32 12>
+  call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v2i64_v8i64__8_12() {
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__8_12:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s16
+; GFX900-NEXT:    s_mov_b32 s11, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__8_12:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__8_12:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <8 x i64> asm "; def $0", "=s"()
+  %vec1 = call <8 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 8, i32 12>
+  call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v2i64_v8i64__9_12() {
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__9_12:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__9_12:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__9_12:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <8 x i64> asm "; def $0", "=s"()
+  %vec1 = call <8 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 9, i32 12>
+  call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v2i64_v8i64__10_12() {
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_12:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__10_12:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__10_12:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <8 x i64> asm "; def $0", "=s"()
+  %vec1 = call <8 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 10, i32 12>
+  call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v2i64_v8i64__11_12() {
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__11_12:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__11_12:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX90A-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX90A-NEXT:    v_writelane_b32 v0, s36, 0
-; GFX90A-NEXT:    v_writelane_b32 v0, s37, 1
-; GFX90A-NEXT:    v_writelane_b32 v0, s38, 2
-; GFX90A-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX90A-NEXT:    v_writelane_b32 v0, s48, 4
-; GFX90A-NEXT:    v_writelane_b32 v0, s49, 5
-; GFX90A-NEXT:    v_writelane_b32 v0, s50, 6
-; GFX90A-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[36:51]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s44
-; GFX90A-NEXT:    s_mov_b32 s11, s45
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_readlane_b32 s51, v0, 7
-; GFX90A-NEXT:    v_readlane_b32 s50, v0, 6
-; GFX90A-NEXT:    v_readlane_b32 s49, v0, 5
-; GFX90A-NEXT:    v_readlane_b32 s48, v0, 4
-; GFX90A-NEXT:    v_readlane_b32 s39, v0, 3
-; GFX90A-NEXT:    v_readlane_b32 s38, v0, 2
-; GFX90A-NEXT:    v_readlane_b32 s37, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s36, v0, 0
-; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v2i64_v8i64__7_12:
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__11_12:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
-; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
-; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:15]
-; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[16:31]
+; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s24
-; GFX942-NEXT:    s_mov_b32 s11, s25
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
-; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
-; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
-; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <8 x i64> asm "; def $0", "=s"()
-  %vec1 = call <8 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 7, i32 12>
-  call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v2i64_v8i64__8_12() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__8_12:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s16
-; GFX9-NEXT:    s_mov_b32 s11, s17
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <8 x i64> asm "; def $0", "=s"()
-  %vec1 = call <8 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 8, i32 12>
-  call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v2i64_v8i64__9_12() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__9_12:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <8 x i64> asm "; def $0", "=s"()
-  %vec1 = call <8 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 9, i32 12>
-  call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v2i64_v8i64__10_12() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__10_12:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <8 x i64> asm "; def $0", "=s"()
-  %vec1 = call <8 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 10, i32 12>
-  call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v2i64_v8i64__11_12() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__11_12:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 11, i32 12>
@@ -26736,8 +27416,7 @@ define void @s_shuffle_v2i64_v8i64__12_12() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s8
-; GFX942-NEXT:    s_mov_b32 s11, s9
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -26750,20 +27429,48 @@ define void @s_shuffle_v2i64_v8i64__12_12() {
 }
 
 define void @s_shuffle_v2i64_v8i64__13_12() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__13_12:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__13_12:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__13_12:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__13_12:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 13, i32 12>
@@ -26808,8 +27515,7 @@ define void @s_shuffle_v2i64_v8i64__14_12() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s8
-; GFX942-NEXT:    s_mov_b32 s15, s9
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[8:9]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -26911,8 +27617,7 @@ define void @s_shuffle_v2i64_v8i64__0_13() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s22
-; GFX942-NEXT:    s_mov_b32 s11, s23
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[22:23]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -26975,8 +27680,7 @@ define void @s_shuffle_v2i64_v8i64__1_13() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -27037,8 +27741,7 @@ define void @s_shuffle_v2i64_v8i64__2_13() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s22
-; GFX942-NEXT:    s_mov_b32 s11, s23
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[22:23]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -27097,8 +27800,7 @@ define void @s_shuffle_v2i64_v8i64__3_13() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -27179,8 +27881,7 @@ define void @s_shuffle_v2i64_v8i64__4_13() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s22
-; GFX942-NEXT:    s_mov_b32 s11, s23
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[22:23]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -27261,8 +27962,7 @@ define void @s_shuffle_v2i64_v8i64__5_13() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -27371,8 +28071,7 @@ define void @s_shuffle_v2i64_v8i64__6_13() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s26
-; GFX942-NEXT:    s_mov_b32 s15, s27
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[26:27]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -27485,8 +28184,7 @@ define void @s_shuffle_v2i64_v8i64__7_13() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -27499,18 +28197,43 @@ define void @s_shuffle_v2i64_v8i64__7_13() {
 }
 
 define void @s_shuffle_v2i64_v8i64__8_13() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__8_13:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s18
-; GFX9-NEXT:    s_mov_b32 s11, s19
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__8_13:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s18
+; GFX900-NEXT:    s_mov_b32 s11, s19
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__8_13:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s18
+; GFX90A-NEXT:    s_mov_b32 s11, s19
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__8_13:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 8, i32 13>
@@ -27525,62 +28248,86 @@ define void @s_shuffle_v2i64_v8i64__9_13() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
-; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
-; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__9_13:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__9_13:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <8 x i64> asm "; def $0", "=s"()
+  %vec1 = call <8 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 9, i32 13>
+  call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v2i64_v8i64__10_13() {
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_13:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v2i64_v8i64__9_13:
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__10_13:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
-; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
-; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v2i64_v8i64__9_13:
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__10_13:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:15]
+; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <8 x i64> asm "; def $0", "=s"()
-  %vec1 = call <8 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 9, i32 13>
-  call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v2i64_v8i64__10_13() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__10_13:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 10, i32 13>
@@ -27625,8 +28372,7 @@ define void @s_shuffle_v2i64_v8i64__11_13() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -27720,8 +28466,7 @@ define void @s_shuffle_v2i64_v8i64__13_13() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -27770,8 +28515,7 @@ define void @s_shuffle_v2i64_v8i64__14_13() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -27818,8 +28562,7 @@ define void @s_shuffle_v2i64_v8i64__u_14() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -27874,8 +28617,7 @@ define void @s_shuffle_v2i64_v8i64__0_14() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s24
-; GFX942-NEXT:    s_mov_b32 s11, s25
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[24:25]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -27934,10 +28676,8 @@ define void @s_shuffle_v2i64_v8i64__1_14() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -27992,8 +28732,7 @@ define void @s_shuffle_v2i64_v8i64__2_14() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s24
-; GFX942-NEXT:    s_mov_b32 s11, s25
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[24:25]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -28052,10 +28791,8 @@ define void @s_shuffle_v2i64_v8i64__3_14() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s20
-; GFX942-NEXT:    s_mov_b32 s11, s21
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[20:21]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -28136,8 +28873,7 @@ define void @s_shuffle_v2i64_v8i64__4_14() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s24
-; GFX942-NEXT:    s_mov_b32 s11, s25
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[24:25]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -28214,13 +28950,11 @@ define void @s_shuffle_v2i64_v8i64__5_14() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s24
-; GFX942-NEXT:    s_mov_b32 s11, s25
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[24:25]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -28329,8 +29063,7 @@ define void @s_shuffle_v2i64_v8i64__6_14() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s28
-; GFX942-NEXT:    s_mov_b32 s15, s29
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[28:29]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -28447,10 +29180,8 @@ define void @s_shuffle_v2i64_v8i64__7_14() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s28
-; GFX942-NEXT:    s_mov_b32 s11, s29
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[28:29]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -28469,18 +29200,43 @@ define void @s_shuffle_v2i64_v8i64__7_14() {
 }
 
 define void @s_shuffle_v2i64_v8i64__8_14() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__8_14:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s20
-; GFX9-NEXT:    s_mov_b32 s11, s21
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__8_14:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s20
+; GFX900-NEXT:    s_mov_b32 s11, s21
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__8_14:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s20
+; GFX90A-NEXT:    s_mov_b32 s11, s21
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__8_14:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[20:21]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 8, i32 14>
@@ -28525,10 +29281,8 @@ define void @s_shuffle_v2i64_v8i64__9_14() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -28541,18 +29295,43 @@ define void @s_shuffle_v2i64_v8i64__9_14() {
 }
 
 define void @s_shuffle_v2i64_v8i64__10_14() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__10_14:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s16
-; GFX9-NEXT:    s_mov_b32 s11, s17
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_14:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s16
+; GFX900-NEXT:    s_mov_b32 s11, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__10_14:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__10_14:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 10, i32 14>
@@ -28597,10 +29376,8 @@ define void @s_shuffle_v2i64_v8i64__11_14() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -28649,8 +29426,7 @@ define void @s_shuffle_v2i64_v8i64__12_14() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -28699,10 +29475,8 @@ define void @s_shuffle_v2i64_v8i64__13_14() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -28751,8 +29525,7 @@ define void @s_shuffle_v2i64_v8i64__14_14() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -28855,8 +29628,7 @@ define void @s_shuffle_v2i64_v8i64__0_15() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s26
-; GFX942-NEXT:    s_mov_b32 s11, s27
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[26:27]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -28908,20 +29680,29 @@ define void @s_shuffle_v2i64_v8i64__1_15() {
 ; GFX942-LABEL: s_shuffle_v2i64_v8i64__1_15:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
+; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s16, s2
-; GFX942-NEXT:    s_mov_b32 s17, s3
-; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
-; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
+; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
+; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
@@ -28973,8 +29754,7 @@ define void @s_shuffle_v2i64_v8i64__2_15() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s26
-; GFX942-NEXT:    s_mov_b32 s11, s27
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[26:27]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -29027,16 +29807,15 @@ define void @s_shuffle_v2i64_v8i64__3_15() {
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:15]
+; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s20, s6
-; GFX942-NEXT:    s_mov_b32 s21, s7
-; GFX942-NEXT:    s_mov_b64 s[8:9], s[20:21]
-; GFX942-NEXT:    s_mov_b64 s[10:11], s[22:23]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -29117,8 +29896,7 @@ define void @s_shuffle_v2i64_v8i64__4_15() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s26
-; GFX942-NEXT:    s_mov_b32 s11, s27
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[26:27]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -29193,16 +29971,15 @@ define void @s_shuffle_v2i64_v8i64__5_15() {
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:15]
+; GFX942-NEXT:    ; def s[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:27]
+; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s24, s10
-; GFX942-NEXT:    s_mov_b32 s25, s11
-; GFX942-NEXT:    s_mov_b64 s[8:9], s[24:25]
-; GFX942-NEXT:    s_mov_b64 s[10:11], s[26:27]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -29311,8 +30088,7 @@ define void @s_shuffle_v2i64_v8i64__6_15() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s30
-; GFX942-NEXT:    s_mov_b32 s15, s31
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[30:31]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -29418,30 +30194,19 @@ define void @s_shuffle_v2i64_v8i64__7_15() {
 ; GFX942-LABEL: s_shuffle_v2i64_v8i64__7_15:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
-; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
-; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[16:31]
+; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s28, s14
-; GFX942-NEXT:    s_mov_b32 s29, s15
-; GFX942-NEXT:    s_mov_b64 s[8:9], s[28:29]
-; GFX942-NEXT:    s_mov_b64 s[10:11], s[30:31]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
-; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
-; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
-; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
@@ -29451,18 +30216,43 @@ define void @s_shuffle_v2i64_v8i64__7_15() {
 }
 
 define void @s_shuffle_v2i64_v8i64__8_15() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__8_15:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s22
-; GFX9-NEXT:    s_mov_b32 s11, s23
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__8_15:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s22
+; GFX900-NEXT:    s_mov_b32 s11, s23
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__8_15:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s22
+; GFX90A-NEXT:    s_mov_b32 s11, s23
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__8_15:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[22:23]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 8, i32 15>
@@ -29507,8 +30297,7 @@ define void @s_shuffle_v2i64_v8i64__9_15() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -29523,18 +30312,43 @@ define void @s_shuffle_v2i64_v8i64__9_15() {
 }
 
 define void @s_shuffle_v2i64_v8i64__10_15() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__10_15:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s18
-; GFX9-NEXT:    s_mov_b32 s11, s19
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_15:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s18
+; GFX900-NEXT:    s_mov_b32 s11, s19
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__10_15:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s18
+; GFX90A-NEXT:    s_mov_b32 s11, s19
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__10_15:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 10, i32 15>
@@ -29579,8 +30393,7 @@ define void @s_shuffle_v2i64_v8i64__11_15() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -29631,8 +30444,7 @@ define void @s_shuffle_v2i64_v8i64__12_15() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -29681,8 +30493,7 @@ define void @s_shuffle_v2i64_v8i64__13_15() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll
index 7f8f2dbbb09a1..54e700625d72c 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll
@@ -88,8 +88,7 @@ define void @v_shuffle_v2p0_v2p0__1_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v2
-; GFX942-NEXT:    v_mov_b32_e32 v1, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -144,8 +143,7 @@ define void @v_shuffle_v2p0_v2p0__3_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v2
-; GFX942-NEXT:    v_mov_b32_e32 v1, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -204,10 +202,8 @@ define void @v_shuffle_v2p0_v2p0__3_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -262,8 +258,7 @@ define void @v_shuffle_v2p0_v2p0__3_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v6
-; GFX942-NEXT:    v_mov_b32_e32 v1, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -309,8 +304,7 @@ define void @v_shuffle_v2p0_v2p0__3_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -355,8 +349,7 @@ define void @v_shuffle_v2p0_v2p0__3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v2
-; GFX942-NEXT:    v_mov_b32_e32 v1, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -401,8 +394,7 @@ define void @v_shuffle_v2p0_v2p0__u_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -446,8 +438,7 @@ define void @v_shuffle_v2p0_v2p0__0_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -492,8 +483,7 @@ define void @v_shuffle_v2p0_v2p0__1_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -537,8 +527,7 @@ define void @v_shuffle_v2p0_v2p0__2_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -660,8 +649,7 @@ define void @v_shuffle_v2p0_v2p0__1_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v2
-; GFX942-NEXT:    v_mov_b32_e32 v1, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -794,8 +782,7 @@ define void @v_shuffle_v2p0_v2p0__1_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v2
-; GFX942-NEXT:    v_mov_b32_e32 v1, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -900,8 +887,7 @@ define void @v_shuffle_v2p0_v2p0__0_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -956,8 +942,7 @@ define void @v_shuffle_v2p0_v2p0__1_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v2
-; GFX942-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1095,8 +1080,7 @@ define void @s_shuffle_v2p0_v2p0__1_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1154,8 +1138,7 @@ define void @s_shuffle_v2p0_v2p0__3_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1213,10 +1196,8 @@ define void @s_shuffle_v2p0_v2p0__3_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1270,8 +1251,7 @@ define void @s_shuffle_v2p0_v2p0__3_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1320,10 +1300,8 @@ define void @s_shuffle_v2p0_v2p0__3_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1336,18 +1314,43 @@ define void @s_shuffle_v2p0_v2p0__3_2() {
 }
 
 define void @s_shuffle_v2p0_v2p0__3_3() {
-; GFX9-LABEL: s_shuffle_v2p0_v2p0__3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v2p0__3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v2p0__3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v2p0__3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <2 x i32> <i32 3, i32 3>
@@ -1388,8 +1391,7 @@ define void @s_shuffle_v2p0_v2p0__u_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1401,18 +1403,43 @@ define void @s_shuffle_v2p0_v2p0__u_0() {
 }
 
 define void @s_shuffle_v2p0_v2p0__0_0() {
-; GFX9-LABEL: s_shuffle_v2p0_v2p0__0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v2p0__0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v2p0__0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v2p0__0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <2 x i32> zeroinitializer
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x ptr> %shuf)
@@ -1456,10 +1483,8 @@ define void @s_shuffle_v2p0_v2p0__1_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1503,8 +1528,7 @@ define void @s_shuffle_v2p0_v2p0__2_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1596,18 +1620,43 @@ define void @s_shuffle_v2p0_v2p0__0_1() {
 }
 
 define void @s_shuffle_v2p0_v2p0__1_1() {
-; GFX9-LABEL: s_shuffle_v2p0_v2p0__1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v2p0__1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v2p0__1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v2p0__1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <2 x i32> <i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x ptr> %shuf)
@@ -1741,8 +1790,7 @@ define void @s_shuffle_v2p0_v2p0__1_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1850,8 +1898,7 @@ define void @s_shuffle_v2p0_v2p0__0_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1905,8 +1952,7 @@ define void @s_shuffle_v2p0_v2p0__1_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v3p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v3p0.ll
index 27a6cf11c4cb1..9c770bf1c77cc 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v3p0.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v3p0.ll
@@ -127,8 +127,7 @@ define void @v_shuffle_v2p0_v3p0__2_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v4
-; GFX942-NEXT:    v_mov_b32_e32 v1, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -223,8 +222,7 @@ define void @v_shuffle_v2p0_v3p0__5_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v4
-; GFX942-NEXT:    v_mov_b32_e32 v1, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -283,10 +281,8 @@ define void @v_shuffle_v2p0_v3p0__5_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -341,8 +337,7 @@ define void @v_shuffle_v2p0_v3p0__5_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v8
-; GFX942-NEXT:    v_mov_b32_e32 v1, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -397,8 +392,7 @@ define void @v_shuffle_v2p0_v3p0__5_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v10
-; GFX942-NEXT:    v_mov_b32_e32 v3, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -447,10 +441,8 @@ define void @v_shuffle_v2p0_v3p0__5_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -495,8 +487,7 @@ define void @v_shuffle_v2p0_v3p0__5_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v4
-; GFX942-NEXT:    v_mov_b32_e32 v1, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -541,8 +532,7 @@ define void @v_shuffle_v2p0_v3p0__5_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -587,8 +577,7 @@ define void @v_shuffle_v2p0_v3p0__u_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -632,8 +621,7 @@ define void @v_shuffle_v2p0_v3p0__0_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -677,8 +665,7 @@ define void @v_shuffle_v2p0_v3p0__1_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -726,10 +713,8 @@ define void @v_shuffle_v2p0_v3p0__2_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -773,8 +758,7 @@ define void @v_shuffle_v2p0_v3p0__3_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -828,8 +812,7 @@ define void @v_shuffle_v2p0_v3p0__4_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-NEXT:    v_mov_b32_e32 v7, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -952,8 +935,7 @@ define void @v_shuffle_v2p0_v3p0__1_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v2
-; GFX942-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -997,8 +979,7 @@ define void @v_shuffle_v2p0_v3p0__2_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v4
-; GFX942-NEXT:    v_mov_b32_e32 v1, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1091,8 +1072,7 @@ define void @v_shuffle_v2p0_v3p0__4_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v2
-; GFX942-NEXT:    v_mov_b32_e32 v9, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1176,8 +1156,7 @@ define void @v_shuffle_v2p0_v3p0__0_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1260,8 +1239,7 @@ define void @v_shuffle_v2p0_v3p0__2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1354,8 +1332,7 @@ define void @v_shuffle_v2p0_v3p0__4_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v4
-; GFX942-NEXT:    v_mov_b32_e32 v11, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1489,8 +1466,7 @@ define void @v_shuffle_v2p0_v3p0__2_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v4
-; GFX942-NEXT:    v_mov_b32_e32 v1, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1545,8 +1521,7 @@ define void @v_shuffle_v2p0_v3p0__4_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1641,8 +1616,7 @@ define void @v_shuffle_v2p0_v3p0__0_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1697,8 +1671,7 @@ define void @v_shuffle_v2p0_v3p0__1_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v6
-; GFX942-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1753,8 +1726,7 @@ define void @v_shuffle_v2p0_v3p0__2_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v4
-; GFX942-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1839,8 +1811,7 @@ define void @v_shuffle_v2p0_v3p0__4_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v2
-; GFX942-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1935,8 +1906,7 @@ define void @v_shuffle_v2p0_v3p0__0_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1991,8 +1961,7 @@ define void @v_shuffle_v2p0_v3p0__1_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v8
-; GFX942-NEXT:    v_mov_b32_e32 v5, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2047,8 +2016,7 @@ define void @v_shuffle_v2p0_v3p0__2_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v4
-; GFX942-NEXT:    v_mov_b32_e32 v9, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2093,8 +2061,7 @@ define void @v_shuffle_v2p0_v3p0__3_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2232,8 +2199,7 @@ define void @s_shuffle_v2p0_v3p0__1_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2273,8 +2239,7 @@ define void @s_shuffle_v2p0_v3p0__2_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2332,8 +2297,7 @@ define void @s_shuffle_v2p0_v3p0__4_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2374,8 +2338,7 @@ define void @s_shuffle_v2p0_v3p0__5_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2430,11 +2393,11 @@ define void @s_shuffle_v2p0_v3p0__5_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2484,8 +2447,7 @@ define void @s_shuffle_v2p0_v3p0__5_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2539,10 +2501,8 @@ define void @s_shuffle_v2p0_v3p0__5_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2587,10 +2547,8 @@ define void @s_shuffle_v2p0_v3p0__5_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2603,18 +2561,43 @@ define void @s_shuffle_v2p0_v3p0__5_3() {
 }
 
 define void @s_shuffle_v2p0_v3p0__5_4() {
-; GFX9-LABEL: s_shuffle_v2p0_v3p0__5_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v3p0__5_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v3p0__5_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v3p0__5_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> <i32 5, i32 4>
@@ -2659,10 +2642,8 @@ define void @s_shuffle_v2p0_v3p0__5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2707,8 +2688,7 @@ define void @s_shuffle_v2p0_v3p0__u_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2720,18 +2700,43 @@ define void @s_shuffle_v2p0_v3p0__u_0() {
 }
 
 define void @s_shuffle_v2p0_v3p0__0_0() {
-; GFX9-LABEL: s_shuffle_v2p0_v3p0__0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v3p0__0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v3p0__0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v3p0__0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> zeroinitializer
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x ptr> %shuf)
@@ -2775,10 +2780,8 @@ define void @s_shuffle_v2p0_v3p0__1_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2822,10 +2825,8 @@ define void @s_shuffle_v2p0_v3p0__2_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2869,8 +2870,7 @@ define void @s_shuffle_v2p0_v3p0__3_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2924,13 +2924,11 @@ define void @s_shuffle_v2p0_v3p0__4_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3023,18 +3021,43 @@ define void @s_shuffle_v2p0_v3p0__0_1() {
 }
 
 define void @s_shuffle_v2p0_v3p0__1_1() {
-; GFX9-LABEL: s_shuffle_v2p0_v3p0__1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v3p0__1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v3p0__1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v3p0__1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> <i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x ptr> %shuf)
@@ -3042,18 +3065,43 @@ define void @s_shuffle_v2p0_v3p0__1_1() {
 }
 
 define void @s_shuffle_v2p0_v3p0__2_1() {
-; GFX9-LABEL: s_shuffle_v2p0_v3p0__2_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v3p0__2_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v3p0__2_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v3p0__2_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> <i32 2, i32 1>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x ptr> %shuf)
@@ -3142,8 +3190,7 @@ define void @s_shuffle_v2p0_v3p0__4_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3188,8 +3235,7 @@ define void @s_shuffle_v2p0_v3p0__u_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3201,18 +3247,43 @@ define void @s_shuffle_v2p0_v3p0__u_2() {
 }
 
 define void @s_shuffle_v2p0_v3p0__0_2() {
-; GFX9-LABEL: s_shuffle_v2p0_v3p0__0_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v3p0__0_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v3p0__0_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v3p0__0_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> <i32 0, i32 2>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x ptr> %shuf)
@@ -3256,10 +3327,8 @@ define void @s_shuffle_v2p0_v3p0__1_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3307,10 +3376,8 @@ define void @s_shuffle_v2p0_v3p0__2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3354,8 +3421,7 @@ define void @s_shuffle_v2p0_v3p0__3_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3412,10 +3478,8 @@ define void @s_shuffle_v2p0_v3p0__4_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3514,8 +3578,7 @@ define void @s_shuffle_v2p0_v3p0__1_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3555,8 +3618,7 @@ define void @s_shuffle_v2p0_v3p0__2_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3618,10 +3680,8 @@ define void @s_shuffle_v2p0_v3p0__4_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3716,8 +3776,7 @@ define void @s_shuffle_v2p0_v3p0__0_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3771,8 +3830,7 @@ define void @s_shuffle_v2p0_v3p0__1_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3826,8 +3884,7 @@ define void @s_shuffle_v2p0_v3p0__2_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3881,18 +3938,43 @@ define void @s_shuffle_v2p0_v3p0__3_4() {
 }
 
 define void @s_shuffle_v2p0_v3p0__4_4() {
-; GFX9-LABEL: s_shuffle_v2p0_v3p0__4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v3p0__4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v3p0__4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v3p0__4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> <i32 4, i32 4>
@@ -3933,8 +4015,7 @@ define void @s_shuffle_v2p0_v3p0__u_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3988,8 +4069,7 @@ define void @s_shuffle_v2p0_v3p0__0_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4047,10 +4127,8 @@ define void @s_shuffle_v2p0_v3p0__1_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4104,10 +4182,8 @@ define void @s_shuffle_v2p0_v3p0__2_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4120,18 +4196,43 @@ define void @s_shuffle_v2p0_v3p0__2_5() {
 }
 
 define void @s_shuffle_v2p0_v3p0__3_5() {
-; GFX9-LABEL: s_shuffle_v2p0_v3p0__3_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v3p0__3_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v3p0__3_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v3p0__3_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> <i32 3, i32 5>
@@ -4176,10 +4277,8 @@ define void @s_shuffle_v2p0_v3p0__4_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v4p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v4p0.ll
index ae31524ebaa7f..47634638d7674 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v4p0.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v4p0.ll
@@ -166,8 +166,7 @@ define void @v_shuffle_v2p0_v4p0__3_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v6
-; GFX942-NEXT:    v_mov_b32_e32 v1, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -302,8 +301,7 @@ define void @v_shuffle_v2p0_v4p0__7_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v6
-; GFX942-NEXT:    v_mov_b32_e32 v1, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -362,10 +360,8 @@ define void @v_shuffle_v2p0_v4p0__7_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v8
-; GFX942-NEXT:    v_mov_b32_e32 v3, v9
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[8:9]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -420,8 +416,7 @@ define void @v_shuffle_v2p0_v4p0__7_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v10
-; GFX942-NEXT:    v_mov_b32_e32 v1, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -476,8 +471,7 @@ define void @v_shuffle_v2p0_v4p0__7_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v12
-; GFX942-NEXT:    v_mov_b32_e32 v3, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v14, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -532,8 +526,7 @@ define void @v_shuffle_v2p0_v4p0__7_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v14
-; GFX942-NEXT:    v_mov_b32_e32 v5, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -582,10 +575,8 @@ define void @v_shuffle_v2p0_v4p0__7_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -630,8 +621,7 @@ define void @v_shuffle_v2p0_v4p0__7_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v6
-; GFX942-NEXT:    v_mov_b32_e32 v1, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -676,8 +666,7 @@ define void @v_shuffle_v2p0_v4p0__7_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -722,8 +711,7 @@ define void @v_shuffle_v2p0_v4p0__7_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v6
-; GFX942-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -768,8 +756,7 @@ define void @v_shuffle_v2p0_v4p0__u_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -813,8 +800,7 @@ define void @v_shuffle_v2p0_v4p0__0_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -858,8 +844,7 @@ define void @v_shuffle_v2p0_v4p0__1_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -903,8 +888,7 @@ define void @v_shuffle_v2p0_v4p0__2_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-NEXT:    v_mov_b32_e32 v7, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -952,10 +936,8 @@ define void @v_shuffle_v2p0_v4p0__3_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -999,8 +981,7 @@ define void @v_shuffle_v2p0_v4p0__4_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1054,8 +1035,7 @@ define void @v_shuffle_v2p0_v4p0__5_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-NEXT:    v_mov_b32_e32 v7, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1110,8 +1090,7 @@ define void @v_shuffle_v2p0_v4p0__6_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v0
-; GFX942-NEXT:    v_mov_b32_e32 v9, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1234,8 +1213,7 @@ define void @v_shuffle_v2p0_v4p0__1_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v2
-; GFX942-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1279,8 +1257,7 @@ define void @v_shuffle_v2p0_v4p0__2_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v2
-; GFX942-NEXT:    v_mov_b32_e32 v7, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1324,8 +1301,7 @@ define void @v_shuffle_v2p0_v4p0__3_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v6
-; GFX942-NEXT:    v_mov_b32_e32 v1, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1418,8 +1394,7 @@ define void @v_shuffle_v2p0_v4p0__5_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v2
-; GFX942-NEXT:    v_mov_b32_e32 v9, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1474,8 +1449,7 @@ define void @v_shuffle_v2p0_v4p0__6_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v2
-; GFX942-NEXT:    v_mov_b32_e32 v11, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1559,8 +1533,7 @@ define void @v_shuffle_v2p0_v4p0__0_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1643,8 +1616,7 @@ define void @v_shuffle_v2p0_v4p0__2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v4
-; GFX942-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1688,8 +1660,7 @@ define void @v_shuffle_v2p0_v4p0__3_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1782,8 +1753,7 @@ define void @v_shuffle_v2p0_v4p0__5_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v4
-; GFX942-NEXT:    v_mov_b32_e32 v11, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v14, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1838,8 +1808,7 @@ define void @v_shuffle_v2p0_v4p0__6_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v4
-; GFX942-NEXT:    v_mov_b32_e32 v13, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v14, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1923,8 +1892,7 @@ define void @v_shuffle_v2p0_v4p0__0_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1968,8 +1936,7 @@ define void @v_shuffle_v2p0_v4p0__1_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v6
-; GFX942-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2052,8 +2019,7 @@ define void @v_shuffle_v2p0_v4p0__3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v6
-; GFX942-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2146,8 +2112,7 @@ define void @v_shuffle_v2p0_v4p0__5_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v6
-; GFX942-NEXT:    v_mov_b32_e32 v13, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2202,8 +2167,7 @@ define void @v_shuffle_v2p0_v4p0__6_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v6
-; GFX942-NEXT:    v_mov_b32_e32 v15, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2376,8 +2340,7 @@ define void @v_shuffle_v2p0_v4p0__3_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v6
-; GFX942-NEXT:    v_mov_b32_e32 v1, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2432,8 +2395,7 @@ define void @v_shuffle_v2p0_v4p0__5_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2478,8 +2440,7 @@ define void @v_shuffle_v2p0_v4p0__6_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-NEXT:    v_mov_b32_e32 v7, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2574,8 +2535,7 @@ define void @v_shuffle_v2p0_v4p0__0_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2630,8 +2590,7 @@ define void @v_shuffle_v2p0_v4p0__1_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v6
-; GFX942-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2686,8 +2645,7 @@ define void @v_shuffle_v2p0_v4p0__2_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v8
-; GFX942-NEXT:    v_mov_b32_e32 v7, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v14, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2742,8 +2700,7 @@ define void @v_shuffle_v2p0_v4p0__3_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v6
-; GFX942-NEXT:    v_mov_b32_e32 v9, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2828,8 +2785,7 @@ define void @v_shuffle_v2p0_v4p0__5_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v2
-; GFX942-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2874,8 +2830,7 @@ define void @v_shuffle_v2p0_v4p0__6_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v2
-; GFX942-NEXT:    v_mov_b32_e32 v7, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2970,8 +2925,7 @@ define void @v_shuffle_v2p0_v4p0__0_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3026,8 +2980,7 @@ define void @v_shuffle_v2p0_v4p0__1_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v8
-; GFX942-NEXT:    v_mov_b32_e32 v5, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3082,8 +3035,7 @@ define void @v_shuffle_v2p0_v4p0__2_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v10
-; GFX942-NEXT:    v_mov_b32_e32 v7, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v14, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3138,8 +3090,7 @@ define void @v_shuffle_v2p0_v4p0__3_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v6
-; GFX942-NEXT:    v_mov_b32_e32 v11, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3184,8 +3135,7 @@ define void @v_shuffle_v2p0_v4p0__4_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3270,8 +3220,7 @@ define void @v_shuffle_v2p0_v4p0__6_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v4
-; GFX942-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3366,8 +3315,7 @@ define void @v_shuffle_v2p0_v4p0__0_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v8
-; GFX942-NEXT:    v_mov_b32_e32 v3, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3422,8 +3370,7 @@ define void @v_shuffle_v2p0_v4p0__1_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v10
-; GFX942-NEXT:    v_mov_b32_e32 v5, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3478,8 +3425,7 @@ define void @v_shuffle_v2p0_v4p0__2_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v12
-; GFX942-NEXT:    v_mov_b32_e32 v7, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v14, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3534,8 +3480,7 @@ define void @v_shuffle_v2p0_v4p0__3_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v6
-; GFX942-NEXT:    v_mov_b32_e32 v13, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3580,8 +3525,7 @@ define void @v_shuffle_v2p0_v4p0__4_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3626,8 +3570,7 @@ define void @v_shuffle_v2p0_v4p0__5_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v6
-; GFX942-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3765,8 +3708,7 @@ define void @s_shuffle_v2p0_v4p0__1_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3850,8 +3792,7 @@ define void @s_shuffle_v2p0_v4p0__3_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3909,8 +3850,7 @@ define void @s_shuffle_v2p0_v4p0__5_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3996,8 +3936,7 @@ define void @s_shuffle_v2p0_v4p0__7_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4056,10 +3995,8 @@ define void @s_shuffle_v2p0_v4p0__7_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4113,8 +4050,7 @@ define void @s_shuffle_v2p0_v4p0__7_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4172,10 +4108,8 @@ define void @s_shuffle_v2p0_v4p0__7_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4230,8 +4164,7 @@ define void @s_shuffle_v2p0_v4p0__7_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4280,10 +4213,8 @@ define void @s_shuffle_v2p0_v4p0__7_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4296,18 +4227,43 @@ define void @s_shuffle_v2p0_v4p0__7_4() {
 }
 
 define void @s_shuffle_v2p0_v4p0__7_5() {
-; GFX9-LABEL: s_shuffle_v2p0_v4p0__7_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v4p0__7_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v4p0__7_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v4p0__7_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> <i32 7, i32 5>
@@ -4352,10 +4308,8 @@ define void @s_shuffle_v2p0_v4p0__7_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4368,18 +4322,43 @@ define void @s_shuffle_v2p0_v4p0__7_6() {
 }
 
 define void @s_shuffle_v2p0_v4p0__7_7() {
-; GFX9-LABEL: s_shuffle_v2p0_v4p0__7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v4p0__7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v4p0__7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v4p0__7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> <i32 7, i32 7>
@@ -4420,8 +4399,7 @@ define void @s_shuffle_v2p0_v4p0__u_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4433,18 +4411,43 @@ define void @s_shuffle_v2p0_v4p0__u_0() {
 }
 
 define void @s_shuffle_v2p0_v4p0__0_0() {
-; GFX9-LABEL: s_shuffle_v2p0_v4p0__0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v4p0__0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v4p0__0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v4p0__0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> zeroinitializer
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x ptr> %shuf)
@@ -4488,10 +4491,8 @@ define void @s_shuffle_v2p0_v4p0__1_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4503,18 +4504,43 @@ define void @s_shuffle_v2p0_v4p0__1_0() {
 }
 
 define void @s_shuffle_v2p0_v4p0__2_0() {
-; GFX9-LABEL: s_shuffle_v2p0_v4p0__2_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s4
-; GFX9-NEXT:    s_mov_b32 s11, s5
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v4p0__2_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v4p0__2_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v4p0__2_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> <i32 2, i32 0>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x ptr> %shuf)
@@ -4558,10 +4584,8 @@ define void @s_shuffle_v2p0_v4p0__3_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4605,8 +4629,7 @@ define void @s_shuffle_v2p0_v4p0__4_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4664,10 +4687,8 @@ define void @s_shuffle_v2p0_v4p0__5_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4722,8 +4743,7 @@ define void @s_shuffle_v2p0_v4p0__6_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4816,18 +4836,43 @@ define void @s_shuffle_v2p0_v4p0__0_1() {
 }
 
 define void @s_shuffle_v2p0_v4p0__1_1() {
-; GFX9-LABEL: s_shuffle_v2p0_v4p0__1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v4p0__1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v4p0__1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v4p0__1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> <i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x ptr> %shuf)
@@ -4835,18 +4880,43 @@ define void @s_shuffle_v2p0_v4p0__1_1() {
 }
 
 define void @s_shuffle_v2p0_v4p0__2_1() {
-; GFX9-LABEL: s_shuffle_v2p0_v4p0__2_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s6
-; GFX9-NEXT:    s_mov_b32 s11, s7
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v4p0__2_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s6
+; GFX900-NEXT:    s_mov_b32 s11, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v4p0__2_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s6
+; GFX90A-NEXT:    s_mov_b32 s11, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v4p0__2_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> <i32 2, i32 1>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x ptr> %shuf)
@@ -4854,18 +4924,43 @@ define void @s_shuffle_v2p0_v4p0__2_1() {
 }
 
 define void @s_shuffle_v2p0_v4p0__3_1() {
-; GFX9-LABEL: s_shuffle_v2p0_v4p0__3_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v4p0__3_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v4p0__3_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v4p0__3_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> <i32 3, i32 1>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x ptr> %shuf)
@@ -4954,8 +5049,7 @@ define void @s_shuffle_v2p0_v4p0__5_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5010,8 +5104,7 @@ define void @s_shuffle_v2p0_v4p0__6_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5056,8 +5149,7 @@ define void @s_shuffle_v2p0_v4p0__u_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5069,18 +5161,43 @@ define void @s_shuffle_v2p0_v4p0__u_2() {
 }
 
 define void @s_shuffle_v2p0_v4p0__0_2() {
-; GFX9-LABEL: s_shuffle_v2p0_v4p0__0_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v4p0__0_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v4p0__0_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v4p0__0_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> <i32 0, i32 2>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x ptr> %shuf)
@@ -5124,10 +5241,8 @@ define void @s_shuffle_v2p0_v4p0__1_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5139,18 +5254,43 @@ define void @s_shuffle_v2p0_v4p0__1_2() {
 }
 
 define void @s_shuffle_v2p0_v4p0__2_2() {
-; GFX9-LABEL: s_shuffle_v2p0_v4p0__2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v4p0__2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v4p0__2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v4p0__2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> <i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x ptr> %shuf)
@@ -5194,10 +5334,8 @@ define void @s_shuffle_v2p0_v4p0__3_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5241,8 +5379,7 @@ define void @s_shuffle_v2p0_v4p0__4_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5299,10 +5436,8 @@ define void @s_shuffle_v2p0_v4p0__5_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5357,8 +5492,7 @@ define void @s_shuffle_v2p0_v4p0__6_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5411,18 +5545,43 @@ define void @s_shuffle_v2p0_v4p0__u_3() {
 }
 
 define void @s_shuffle_v2p0_v4p0__0_3() {
-; GFX9-LABEL: s_shuffle_v2p0_v4p0__0_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v4p0__0_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v4p0__0_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v4p0__0_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> <i32 0, i32 3>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x ptr> %shuf)
@@ -5430,18 +5589,43 @@ define void @s_shuffle_v2p0_v4p0__0_3() {
 }
 
 define void @s_shuffle_v2p0_v4p0__1_3() {
-; GFX9-LABEL: s_shuffle_v2p0_v4p0__1_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v4p0__1_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v4p0__1_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v4p0__1_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> <i32 1, i32 3>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x ptr> %shuf)
@@ -5489,18 +5673,43 @@ define void @s_shuffle_v2p0_v4p0__2_3() {
 }
 
 define void @s_shuffle_v2p0_v4p0__3_3() {
-; GFX9-LABEL: s_shuffle_v2p0_v4p0__3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v4p0__3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v4p0__3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v4p0__3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> <i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x ptr> %shuf)
@@ -5590,8 +5799,7 @@ define void @s_shuffle_v2p0_v4p0__5_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5646,8 +5854,7 @@ define void @s_shuffle_v2p0_v4p0__6_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5746,8 +5953,7 @@ define void @s_shuffle_v2p0_v4p0__1_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5831,8 +6037,7 @@ define void @s_shuffle_v2p0_v4p0__3_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5894,10 +6099,8 @@ define void @s_shuffle_v2p0_v4p0__5_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5910,18 +6113,43 @@ define void @s_shuffle_v2p0_v4p0__5_4() {
 }
 
 define void @s_shuffle_v2p0_v4p0__6_4() {
-; GFX9-LABEL: s_shuffle_v2p0_v4p0__6_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s4
-; GFX9-NEXT:    s_mov_b32 s11, s5
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v4p0__6_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v4p0__6_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v4p0__6_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> <i32 6, i32 4>
@@ -6012,8 +6240,7 @@ define void @s_shuffle_v2p0_v4p0__0_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6067,8 +6294,7 @@ define void @s_shuffle_v2p0_v4p0__1_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6123,8 +6349,7 @@ define void @s_shuffle_v2p0_v4p0__2_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6178,8 +6403,7 @@ define void @s_shuffle_v2p0_v4p0__3_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6233,18 +6457,43 @@ define void @s_shuffle_v2p0_v4p0__4_5() {
 }
 
 define void @s_shuffle_v2p0_v4p0__5_5() {
-; GFX9-LABEL: s_shuffle_v2p0_v4p0__5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v4p0__5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v4p0__5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v4p0__5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> <i32 5, i32 5>
@@ -6253,18 +6502,43 @@ define void @s_shuffle_v2p0_v4p0__5_5() {
 }
 
 define void @s_shuffle_v2p0_v4p0__6_5() {
-; GFX9-LABEL: s_shuffle_v2p0_v4p0__6_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s6
-; GFX9-NEXT:    s_mov_b32 s11, s7
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v4p0__6_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s6
+; GFX900-NEXT:    s_mov_b32 s11, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v4p0__6_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s6
+; GFX90A-NEXT:    s_mov_b32 s11, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v4p0__6_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> <i32 6, i32 5>
@@ -6305,8 +6579,7 @@ define void @s_shuffle_v2p0_v4p0__u_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6360,8 +6633,7 @@ define void @s_shuffle_v2p0_v4p0__0_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6419,10 +6691,8 @@ define void @s_shuffle_v2p0_v4p0__1_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6477,8 +6747,7 @@ define void @s_shuffle_v2p0_v4p0__2_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6536,10 +6805,8 @@ define void @s_shuffle_v2p0_v4p0__3_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6552,18 +6819,43 @@ define void @s_shuffle_v2p0_v4p0__3_6() {
 }
 
 define void @s_shuffle_v2p0_v4p0__4_6() {
-; GFX9-LABEL: s_shuffle_v2p0_v4p0__4_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v4p0__4_6:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v4p0__4_6:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v4p0__4_6:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> <i32 4, i32 6>
@@ -6608,10 +6900,8 @@ define void @s_shuffle_v2p0_v4p0__5_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6624,18 +6914,43 @@ define void @s_shuffle_v2p0_v4p0__5_6() {
 }
 
 define void @s_shuffle_v2p0_v4p0__6_6() {
-; GFX9-LABEL: s_shuffle_v2p0_v4p0__6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v4p0__6_6:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v4p0__6_6:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v4p0__6_6:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> <i32 6, i32 6>
@@ -6726,8 +7041,7 @@ define void @s_shuffle_v2p0_v4p0__0_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6782,8 +7096,7 @@ define void @s_shuffle_v2p0_v4p0__1_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6838,8 +7151,7 @@ define void @s_shuffle_v2p0_v4p0__2_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6894,8 +7206,7 @@ define void @s_shuffle_v2p0_v4p0__3_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6908,18 +7219,43 @@ define void @s_shuffle_v2p0_v4p0__3_7() {
 }
 
 define void @s_shuffle_v2p0_v4p0__4_7() {
-; GFX9-LABEL: s_shuffle_v2p0_v4p0__4_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v4p0__4_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v4p0__4_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v4p0__4_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> <i32 4, i32 7>
@@ -6928,18 +7264,43 @@ define void @s_shuffle_v2p0_v4p0__4_7() {
 }
 
 define void @s_shuffle_v2p0_v4p0__5_7() {
-; GFX9-LABEL: s_shuffle_v2p0_v4p0__5_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v4p0__5_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v4p0__5_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v4p0__5_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> <i32 5, i32 7>
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll
index a15fc3212f474..1b3cf9c6eba0f 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll
@@ -45,6 +45,7 @@ define void @v_shuffle_v3i64_v2i64__0_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -84,10 +85,11 @@ define void @v_shuffle_v3i64_v2i64__1_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3i64_v2i64__1_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
@@ -140,10 +142,11 @@ define void @v_shuffle_v3i64_v2i64__3_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
@@ -203,7 +206,7 @@ define void @v_shuffle_v3i64_v2i64__3_0_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[2:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
@@ -254,14 +257,14 @@ define void @v_shuffle_v3i64_v2i64__3_1_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_1_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -308,7 +311,7 @@ define void @v_shuffle_v3i64_v2i64__3_2_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
@@ -351,10 +354,11 @@ define void @v_shuffle_v3i64_v2i64__3_3_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_3_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
@@ -1472,6 +1476,7 @@ define void @v_shuffle_v3i64_v2i64__0_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1511,10 +1516,11 @@ define void @v_shuffle_v3i64_v2i64__1_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3i64_v2i64__1_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
@@ -2266,8 +2272,7 @@ define void @s_shuffle_v3i64_v2i64__1_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2325,8 +2330,7 @@ define void @s_shuffle_v3i64_v2i64__3_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2384,10 +2388,8 @@ define void @s_shuffle_v3i64_v2i64__3_0_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2441,8 +2443,7 @@ define void @s_shuffle_v3i64_v2i64__3_1_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2491,10 +2492,8 @@ define void @s_shuffle_v3i64_v2i64__3_2_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2507,18 +2506,43 @@ define void @s_shuffle_v3i64_v2i64__3_2_u() {
 }
 
 define void @s_shuffle_v3i64_v2i64__3_3_u() {
-; GFX9-LABEL: s_shuffle_v3i64_v2i64__3_3_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v2i64__3_3_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v2i64__3_3_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v2i64__3_3_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> <i32 3, i32 3, i32 poison>
@@ -2572,10 +2596,8 @@ define void @s_shuffle_v3i64_v2i64__3_3_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2633,10 +2655,8 @@ define void @s_shuffle_v3i64_v2i64__3_3_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2689,12 +2709,9 @@ define void @s_shuffle_v3i64_v2i64__3_3_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2707,20 +2724,48 @@ define void @s_shuffle_v3i64_v2i64__3_3_2() {
 }
 
 define void @s_shuffle_v3i64_v2i64__3_3_3() {
-; GFX9-LABEL: s_shuffle_v3i64_v2i64__3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v2i64__3_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v2i64__3_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v2i64__3_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> <i32 3, i32 3, i32 3>
@@ -2765,10 +2810,8 @@ define void @s_shuffle_v3i64_v2i64__u_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2780,20 +2823,48 @@ define void @s_shuffle_v3i64_v2i64__u_0_0() {
 }
 
 define void @s_shuffle_v3i64_v2i64__0_0_0() {
-; GFX9-LABEL: s_shuffle_v3i64_v2i64__0_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_mov_b32 s13, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v2i64__0_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    s_mov_b32 s12, s8
+; GFX900-NEXT:    s_mov_b32 s13, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v2i64__0_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    s_mov_b32 s12, s8
+; GFX90A-NEXT:    s_mov_b32 s13, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v2i64__0_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> zeroinitializer
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
@@ -2841,12 +2912,9 @@ define void @s_shuffle_v3i64_v2i64__1_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2894,10 +2962,8 @@ define void @s_shuffle_v3i64_v2i64__2_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2958,12 +3024,9 @@ define void @s_shuffle_v3i64_v2i64__3_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3021,10 +3084,8 @@ define void @s_shuffle_v3i64_v2i64__3_u_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3086,12 +3147,9 @@ define void @s_shuffle_v3i64_v2i64__3_1_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3153,12 +3211,9 @@ define void @s_shuffle_v3i64_v2i64__3_2_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3171,18 +3226,43 @@ define void @s_shuffle_v3i64_v2i64__3_2_0() {
 }
 
 define void @s_shuffle_v3i64_v2i64__u_1_1() {
-; GFX9-LABEL: s_shuffle_v3i64_v2i64__u_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v2i64__u_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v2i64__u_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v2i64__u_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> <i32 poison, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
@@ -3190,18 +3270,43 @@ define void @s_shuffle_v3i64_v2i64__u_1_1() {
 }
 
 define void @s_shuffle_v3i64_v2i64__0_1_1() {
-; GFX9-LABEL: s_shuffle_v3i64_v2i64__0_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v2i64__0_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v2i64__0_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v2i64__0_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> <i32 0, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
@@ -3209,20 +3314,48 @@ define void @s_shuffle_v3i64_v2i64__0_1_1() {
 }
 
 define void @s_shuffle_v3i64_v2i64__1_1_1() {
-; GFX9-LABEL: s_shuffle_v3i64_v2i64__1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v2i64__1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v2i64__1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v2i64__1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> <i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
@@ -3230,18 +3363,43 @@ define void @s_shuffle_v3i64_v2i64__1_1_1() {
 }
 
 define void @s_shuffle_v3i64_v2i64__2_1_1() {
-; GFX9-LABEL: s_shuffle_v3i64_v2i64__2_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v2i64__2_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v2i64__2_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v2i64__2_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> <i32 2, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
@@ -3294,10 +3452,8 @@ define void @s_shuffle_v3i64_v2i64__3_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3355,10 +3511,8 @@ define void @s_shuffle_v3i64_v2i64__3_u_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3420,12 +3574,9 @@ define void @s_shuffle_v3i64_v2i64__3_0_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3487,12 +3638,9 @@ define void @s_shuffle_v3i64_v2i64__3_2_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3591,8 +3739,7 @@ define void @s_shuffle_v3i64_v2i64__1_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3658,12 +3805,9 @@ define void @s_shuffle_v3i64_v2i64__3_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3712,10 +3856,8 @@ define void @s_shuffle_v3i64_v2i64__3_u_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3773,12 +3915,9 @@ define void @s_shuffle_v3i64_v2i64__3_0_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3836,10 +3975,8 @@ define void @s_shuffle_v3i64_v2i64__3_1_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3852,18 +3989,43 @@ define void @s_shuffle_v3i64_v2i64__3_1_2() {
 }
 
 define void @s_shuffle_v3i64_v2i64__u_3_3() {
-; GFX9-LABEL: s_shuffle_v3i64_v2i64__u_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v2i64__u_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v2i64__u_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v2i64__u_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> <i32 poison, i32 3, i32 3>
@@ -3917,10 +4079,8 @@ define void @s_shuffle_v3i64_v2i64__0_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3978,10 +4138,8 @@ define void @s_shuffle_v3i64_v2i64__1_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3994,18 +4152,43 @@ define void @s_shuffle_v3i64_v2i64__1_3_3() {
 }
 
 define void @s_shuffle_v3i64_v2i64__2_3_3() {
-; GFX9-LABEL: s_shuffle_v3i64_v2i64__2_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v2i64__2_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v2i64__2_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v2i64__2_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> <i32 2, i32 3, i32 3>
@@ -4050,10 +4233,8 @@ define void @s_shuffle_v3i64_v2i64__3_u_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4115,12 +4296,9 @@ define void @s_shuffle_v3i64_v2i64__3_0_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4178,10 +4356,8 @@ define void @s_shuffle_v3i64_v2i64__3_1_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4234,12 +4410,9 @@ define void @s_shuffle_v3i64_v2i64__3_2_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v3i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v3i64.ll
index f15dd7d2772e5..0ff4daf6feb33 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v3i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v3i64.ll
@@ -87,6 +87,7 @@ define void @v_shuffle_v3i64_v3i64__1_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -126,10 +127,11 @@ define void @v_shuffle_v3i64_v3i64__2_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3i64_v3i64__2_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
@@ -182,6 +184,7 @@ define void @v_shuffle_v3i64_v3i64__4_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -222,10 +225,11 @@ define void @v_shuffle_v3i64_v3i64__5_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
@@ -285,7 +289,7 @@ define void @v_shuffle_v3i64_v3i64__5_0_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[2:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
@@ -336,10 +340,11 @@ define void @v_shuffle_v3i64_v3i64__5_1_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_1_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v10, 0
+; GFX942-NEXT:    global_store_dwordx2 v10, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -399,7 +404,7 @@ define void @v_shuffle_v3i64_v3i64__5_2_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[6:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    global_store_dwordx2 v12, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v10
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v11
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[2:5], s[0:1]
@@ -454,6 +459,7 @@ define void @v_shuffle_v3i64_v3i64__5_3_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -494,10 +500,11 @@ define void @v_shuffle_v3i64_v3i64__5_4_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_4_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
@@ -546,6 +553,7 @@ define void @v_shuffle_v3i64_v3i64__5_5_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2902,6 +2910,7 @@ define void @v_shuffle_v3i64_v3i64__1_3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2941,10 +2950,11 @@ define void @v_shuffle_v3i64_v3i64__2_3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3i64_v3i64__2_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
@@ -4720,8 +4730,7 @@ define void @s_shuffle_v3i64_v3i64__1_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4761,8 +4770,7 @@ define void @s_shuffle_v3i64_v3i64__2_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4820,8 +4828,7 @@ define void @s_shuffle_v3i64_v3i64__4_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4862,8 +4869,7 @@ define void @s_shuffle_v3i64_v3i64__5_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4918,11 +4924,11 @@ define void @s_shuffle_v3i64_v3i64__5_0_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4972,8 +4978,7 @@ define void @s_shuffle_v3i64_v3i64__5_1_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5027,10 +5032,8 @@ define void @s_shuffle_v3i64_v3i64__5_2_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5075,10 +5078,8 @@ define void @s_shuffle_v3i64_v3i64__5_3_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5091,18 +5092,43 @@ define void @s_shuffle_v3i64_v3i64__5_3_u() {
 }
 
 define void @s_shuffle_v3i64_v3i64__5_4_u() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__5_4_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_4_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_4_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_4_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 5, i32 4, i32 poison>
@@ -5147,10 +5173,8 @@ define void @s_shuffle_v3i64_v3i64__5_5_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5212,12 +5236,9 @@ define void @s_shuffle_v3i64_v3i64__5_5_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5279,12 +5300,9 @@ define void @s_shuffle_v3i64_v3i64__5_5_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5342,10 +5360,8 @@ define void @s_shuffle_v3i64_v3i64__5_5_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5394,12 +5410,9 @@ define void @s_shuffle_v3i64_v3i64__5_5_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5452,12 +5465,9 @@ define void @s_shuffle_v3i64_v3i64__5_5_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5470,20 +5480,48 @@ define void @s_shuffle_v3i64_v3i64__5_5_4() {
 }
 
 define void @s_shuffle_v3i64_v3i64__5_5_5() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 5, i32 5, i32 5>
@@ -5528,10 +5566,8 @@ define void @s_shuffle_v3i64_v3i64__u_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5543,20 +5579,48 @@ define void @s_shuffle_v3i64_v3i64__u_0_0() {
 }
 
 define void @s_shuffle_v3i64_v3i64__0_0_0() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__0_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_mov_b32 s13, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__0_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    s_mov_b32 s12, s8
+; GFX900-NEXT:    s_mov_b32 s13, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__0_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    s_mov_b32 s12, s8
+; GFX90A-NEXT:    s_mov_b32 s13, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__0_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> zeroinitializer
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
@@ -5604,12 +5668,9 @@ define void @s_shuffle_v3i64_v3i64__1_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5657,12 +5718,9 @@ define void @s_shuffle_v3i64_v3i64__2_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5710,10 +5768,8 @@ define void @s_shuffle_v3i64_v3i64__3_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5771,15 +5827,12 @@ define void @s_shuffle_v3i64_v3i64__4_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5838,13 +5891,11 @@ define void @s_shuffle_v3i64_v3i64__5_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5899,11 +5950,11 @@ define void @s_shuffle_v3i64_v3i64__5_u_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5962,13 +6013,11 @@ define void @s_shuffle_v3i64_v3i64__5_1_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6022,12 +6071,9 @@ define void @s_shuffle_v3i64_v3i64__5_2_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6086,13 +6132,11 @@ define void @s_shuffle_v3i64_v3i64__5_3_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6150,10 +6194,8 @@ define void @s_shuffle_v3i64_v3i64__5_4_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6166,116 +6208,12 @@ define void @s_shuffle_v3i64_v3i64__5_4_0() {
 }
 
 define void @s_shuffle_v3i64_v3i64__u_1_1() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__u_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 poison, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v3i64__0_1_1() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__0_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 0, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v3i64__1_1_1() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v3i64__2_1_1() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__2_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 2, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v3i64__3_1_1() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__3_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 3, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v3i64__4_1_1() {
-; GFX900-LABEL: s_shuffle_v3i64_v3i64__4_1_1:
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__u_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:9]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
 ; GFX900-NEXT:    s_mov_b32 s12, s10
 ; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -6283,17 +6221,12 @@ define void @s_shuffle_v3i64_v3i64__4_1_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v3i64__4_1_1:
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__u_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:9]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
 ; GFX90A-NEXT:    s_mov_b32 s12, s10
 ; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -6301,40 +6234,30 @@ define void @s_shuffle_v3i64_v3i64__4_1_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v3i64__4_1_1:
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__u_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:5]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
-  %vec1 = call <3 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 4, i32 1, i32 1>
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 poison, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v3i64__5_1_1() {
-; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_1_1:
+define void @s_shuffle_v3i64_v3i64__0_1_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__0_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:9]
-; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s12, s10
 ; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -6342,15 +6265,12 @@ define void @s_shuffle_v3i64_v3i64__5_1_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_1_1:
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__0_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:9]
-; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s12, s10
 ; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -6358,19 +6278,268 @@ define void @s_shuffle_v3i64_v3i64__5_1_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_1_1:
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__0_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:5]
+; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 0, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v3i64__1_1_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 1, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v3i64__2_1_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__2_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__2_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__2_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 2, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v3i64__3_1_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__3_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__3_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__3_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 3, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v3i64__4_1_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__4_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:9]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__4_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:9]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__4_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:5]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %vec1 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 4, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v3i64__5_1_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:9]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:9]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:5]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6425,11 +6594,11 @@ define void @s_shuffle_v3i64_v3i64__5_u_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6488,13 +6657,11 @@ define void @s_shuffle_v3i64_v3i64__5_0_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6552,12 +6719,9 @@ define void @s_shuffle_v3i64_v3i64__5_2_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6577,217 +6741,344 @@ define void @s_shuffle_v3i64_v3i64__5_3_1() {
 ; GFX900-NEXT:    ; def s[4:9]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:17]
+; GFX900-NEXT:    ; def s[12:17]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s16
+; GFX900-NEXT:    s_mov_b32 s9, s17
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_3_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:9]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:17]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_3_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:5]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:9]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %vec1 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 5, i32 3, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v3i64__5_4_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_4_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:9]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_4_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:9]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_4_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:5]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %vec1 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 5, i32 4, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v3i64__u_2_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__u_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__u_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__u_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 poison, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v3i64__0_2_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__0_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__0_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__0_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 0, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v3i64__1_2_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__1_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__1_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__1_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 1, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v3i64__2_2_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__2_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s16
-; GFX900-NEXT:    s_mov_b32 s9, s17
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_3_1:
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:9]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:17]
+; GFX90A-NEXT:    ; def s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s16
-; GFX90A-NEXT:    s_mov_b32 s9, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_3_1:
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:5]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:9]
+; GFX942-NEXT:    ; def s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
-  %vec1 = call <3 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 5, i32 3, i32 1>
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v3i64__5_4_1() {
-; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_4_1:
+define void @s_shuffle_v3i64_v3i64__3_2_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__3_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:9]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s12
-; GFX900-NEXT:    s_mov_b32 s9, s13
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_4_1:
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__3_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:9]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s12
-; GFX90A-NEXT:    s_mov_b32 s9, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_4_1:
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__3_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:5]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x i64> asm "; def $0", "=s"()
-  %vec1 = call <3 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 5, i32 4, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v3i64__u_2_2() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__u_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 poison, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v3i64__0_2_2() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__0_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 0, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v3i64__1_2_2() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__1_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 1, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v3i64__2_2_2() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 2, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v3i64__3_2_2() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__3_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 3, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
@@ -6840,10 +7131,8 @@ define void @s_shuffle_v3i64_v3i64__4_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6897,10 +7186,8 @@ define void @s_shuffle_v3i64_v3i64__5_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6950,8 +7237,7 @@ define void @s_shuffle_v3i64_v3i64__5_u_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7009,12 +7295,9 @@ define void @s_shuffle_v3i64_v3i64__5_0_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7064,8 +7347,7 @@ define void @s_shuffle_v3i64_v3i64__5_1_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7119,10 +7401,8 @@ define void @s_shuffle_v3i64_v3i64__5_3_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7180,10 +7460,8 @@ define void @s_shuffle_v3i64_v3i64__5_4_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7282,8 +7560,7 @@ define void @s_shuffle_v3i64_v3i64__1_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7323,8 +7600,7 @@ define void @s_shuffle_v3i64_v3i64__2_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7390,12 +7666,9 @@ define void @s_shuffle_v3i64_v3i64__4_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7444,12 +7717,9 @@ define void @s_shuffle_v3i64_v3i64__5_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7494,10 +7764,8 @@ define void @s_shuffle_v3i64_v3i64__5_u_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7552,13 +7820,11 @@ define void @s_shuffle_v3i64_v3i64__5_0_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7612,10 +7878,8 @@ define void @s_shuffle_v3i64_v3i64__5_1_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7673,10 +7937,8 @@ define void @s_shuffle_v3i64_v3i64__5_2_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:17]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7725,12 +7987,9 @@ define void @s_shuffle_v3i64_v3i64__5_4_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7743,18 +8002,43 @@ define void @s_shuffle_v3i64_v3i64__5_4_3() {
 }
 
 define void @s_shuffle_v3i64_v3i64__u_4_4() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__u_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__u_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__u_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__u_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 poison, i32 4, i32 4>
@@ -7808,10 +8092,8 @@ define void @s_shuffle_v3i64_v3i64__0_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7869,10 +8151,8 @@ define void @s_shuffle_v3i64_v3i64__1_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7930,10 +8210,8 @@ define void @s_shuffle_v3i64_v3i64__2_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7946,18 +8224,43 @@ define void @s_shuffle_v3i64_v3i64__2_4_4() {
 }
 
 define void @s_shuffle_v3i64_v3i64__3_4_4() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__3_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__3_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__3_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__3_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 3, i32 4, i32 4>
@@ -7966,20 +8269,48 @@ define void @s_shuffle_v3i64_v3i64__3_4_4() {
 }
 
 define void @s_shuffle_v3i64_v3i64__4_4_4() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__4_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__4_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__4_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__4_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 4, i32 4, i32 4>
@@ -7988,20 +8319,48 @@ define void @s_shuffle_v3i64_v3i64__4_4_4() {
 }
 
 define void @s_shuffle_v3i64_v3i64__5_4_4() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__5_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 5, i32 4, i32 4>
@@ -8042,10 +8401,8 @@ define void @s_shuffle_v3i64_v3i64__5_u_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8104,13 +8461,11 @@ define void @s_shuffle_v3i64_v3i64__5_0_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8164,10 +8519,8 @@ define void @s_shuffle_v3i64_v3i64__5_1_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8225,12 +8578,9 @@ define void @s_shuffle_v3i64_v3i64__5_2_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8279,12 +8629,9 @@ define void @s_shuffle_v3i64_v3i64__5_3_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8297,18 +8644,43 @@ define void @s_shuffle_v3i64_v3i64__5_3_4() {
 }
 
 define void @s_shuffle_v3i64_v3i64__u_5_5() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__u_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__u_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__u_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__u_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 poison, i32 5, i32 5>
@@ -8362,10 +8734,8 @@ define void @s_shuffle_v3i64_v3i64__0_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8423,10 +8793,8 @@ define void @s_shuffle_v3i64_v3i64__1_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8484,10 +8852,8 @@ define void @s_shuffle_v3i64_v3i64__2_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8500,18 +8866,43 @@ define void @s_shuffle_v3i64_v3i64__2_5_5() {
 }
 
 define void @s_shuffle_v3i64_v3i64__3_5_5() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__3_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__3_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__3_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__3_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 3, i32 5, i32 5>
@@ -8520,20 +8911,48 @@ define void @s_shuffle_v3i64_v3i64__3_5_5() {
 }
 
 define void @s_shuffle_v3i64_v3i64__4_5_5() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__4_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__4_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__4_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__4_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 4, i32 5, i32 5>
@@ -8542,18 +8961,43 @@ define void @s_shuffle_v3i64_v3i64__4_5_5() {
 }
 
 define void @s_shuffle_v3i64_v3i64__5_u_5() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__5_u_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_u_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_u_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_u_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 5, i32 poison, i32 5>
@@ -8607,10 +9051,8 @@ define void @s_shuffle_v3i64_v3i64__5_0_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8668,10 +9110,8 @@ define void @s_shuffle_v3i64_v3i64__5_1_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8729,10 +9169,8 @@ define void @s_shuffle_v3i64_v3i64__5_2_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8785,12 +9223,9 @@ define void @s_shuffle_v3i64_v3i64__5_3_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8803,18 +9238,43 @@ define void @s_shuffle_v3i64_v3i64__5_3_5() {
 }
 
 define void @s_shuffle_v3i64_v3i64__5_4_5() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__5_4_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_4_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_4_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_4_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 5, i32 4, i32 5>
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v4i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v4i64.ll
index 6e156d2d4a2f5..58f1e346b6503 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v4i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v4i64.ll
@@ -87,6 +87,7 @@ define void @v_shuffle_v3i64_v4i64__1_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -126,6 +127,7 @@ define void @v_shuffle_v3i64_v4i64__2_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -165,10 +167,11 @@ define void @v_shuffle_v3i64_v4i64__3_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3i64_v4i64__3_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -221,6 +224,7 @@ define void @v_shuffle_v3i64_v4i64__5_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -261,6 +265,7 @@ define void @v_shuffle_v3i64_v4i64__6_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -301,10 +306,11 @@ define void @v_shuffle_v3i64_v4i64__7_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -364,7 +370,7 @@ define void @v_shuffle_v3i64_v4i64__7_0_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[2:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    global_store_dwordx2 v10, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v8
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v9
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
@@ -415,10 +421,11 @@ define void @v_shuffle_v3i64_v4i64__7_1_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_1_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v12, 0
+; GFX942-NEXT:    global_store_dwordx2 v12, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -478,7 +485,7 @@ define void @v_shuffle_v3i64_v4i64__7_2_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[6:13]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    global_store_dwordx2 v14, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v12
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v13
 ; GFX942-NEXT:    global_store_dwordx4 v14, v[2:5], s[0:1]
@@ -534,7 +541,7 @@ define void @v_shuffle_v3i64_v4i64__7_3_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    global_store_dwordx2 v16, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v14
 ; GFX942-NEXT:    v_mov_b32_e32 v5, v15
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
@@ -589,6 +596,7 @@ define void @v_shuffle_v3i64_v4i64__7_4_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -629,10 +637,11 @@ define void @v_shuffle_v3i64_v4i64__7_5_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_5_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -681,6 +690,7 @@ define void @v_shuffle_v3i64_v4i64__7_6_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -727,6 +737,7 @@ define void @v_shuffle_v3i64_v4i64__7_7_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4719,6 +4730,7 @@ define void @v_shuffle_v3i64_v4i64__1_4_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4758,6 +4770,7 @@ define void @v_shuffle_v3i64_v4i64__2_4_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4797,10 +4810,11 @@ define void @v_shuffle_v3i64_v4i64__3_4_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3i64_v4i64__3_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -8016,8 +8030,7 @@ define void @s_shuffle_v3i64_v4i64__1_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8057,8 +8070,7 @@ define void @s_shuffle_v3i64_v4i64__2_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8102,8 +8114,7 @@ define void @s_shuffle_v3i64_v4i64__3_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8161,8 +8172,7 @@ define void @s_shuffle_v3i64_v4i64__5_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8203,8 +8213,7 @@ define void @s_shuffle_v3i64_v4i64__6_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8249,8 +8258,7 @@ define void @s_shuffle_v3i64_v4i64__7_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8309,10 +8317,8 @@ define void @s_shuffle_v3i64_v4i64__7_0_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8366,8 +8372,7 @@ define void @s_shuffle_v3i64_v4i64__7_1_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8425,10 +8430,8 @@ define void @s_shuffle_v3i64_v4i64__7_2_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8482,10 +8485,8 @@ define void @s_shuffle_v3i64_v4i64__7_3_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8534,10 +8535,8 @@ define void @s_shuffle_v3i64_v4i64__7_4_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8550,18 +8549,43 @@ define void @s_shuffle_v3i64_v4i64__7_4_u() {
 }
 
 define void @s_shuffle_v3i64_v4i64__7_5_u() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__7_5_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_5_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 5, i32 poison>
@@ -8606,10 +8630,8 @@ define void @s_shuffle_v3i64_v4i64__7_6_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8654,10 +8676,8 @@ define void @s_shuffle_v3i64_v4i64__7_7_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8716,13 +8736,11 @@ define void @s_shuffle_v3i64_v4i64__7_7_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8781,13 +8799,11 @@ define void @s_shuffle_v3i64_v4i64__7_7_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8841,10 +8857,8 @@ define void @s_shuffle_v3i64_v4i64__7_7_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8902,12 +8916,9 @@ define void @s_shuffle_v3i64_v4i64__7_7_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8956,12 +8967,9 @@ define void @s_shuffle_v3i64_v4i64__7_7_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9010,12 +9018,9 @@ define void @s_shuffle_v3i64_v4i64__7_7_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9028,20 +9033,48 @@ define void @s_shuffle_v3i64_v4i64__7_7_5() {
 }
 
 define void @s_shuffle_v3i64_v4i64__7_7_6() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__7_7_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_7_6:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_7_6:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_7_6:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 7, i32 6>
@@ -9090,12 +9123,9 @@ define void @s_shuffle_v3i64_v4i64__7_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9144,10 +9174,8 @@ define void @s_shuffle_v3i64_v4i64__u_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9159,20 +9187,48 @@ define void @s_shuffle_v3i64_v4i64__u_0_0() {
 }
 
 define void @s_shuffle_v3i64_v4i64__0_0_0() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__0_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_mov_b32 s13, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__0_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    s_mov_b32 s12, s8
+; GFX900-NEXT:    s_mov_b32 s13, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__0_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    s_mov_b32 s12, s8
+; GFX90A-NEXT:    s_mov_b32 s13, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__0_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> zeroinitializer
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
@@ -9220,12 +9276,9 @@ define void @s_shuffle_v3i64_v4i64__1_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9273,12 +9326,9 @@ define void @s_shuffle_v3i64_v4i64__2_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9330,12 +9380,9 @@ define void @s_shuffle_v3i64_v4i64__3_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9383,10 +9430,8 @@ define void @s_shuffle_v3i64_v4i64__4_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9444,15 +9489,12 @@ define void @s_shuffle_v3i64_v4i64__5_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9511,13 +9553,11 @@ define void @s_shuffle_v3i64_v4i64__6_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9576,15 +9616,12 @@ define void @s_shuffle_v3i64_v4i64__7_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9639,13 +9676,11 @@ define void @s_shuffle_v3i64_v4i64__7_u_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9704,15 +9739,12 @@ define void @s_shuffle_v3i64_v4i64__7_1_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9770,12 +9802,9 @@ define void @s_shuffle_v3i64_v4i64__7_2_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9833,12 +9862,9 @@ define void @s_shuffle_v3i64_v4i64__7_3_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9897,15 +9923,12 @@ define void @s_shuffle_v3i64_v4i64__7_4_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9963,10 +9986,8 @@ define void @s_shuffle_v3i64_v4i64__7_5_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10028,12 +10049,9 @@ define void @s_shuffle_v3i64_v4i64__7_6_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10046,137 +10064,12 @@ define void @s_shuffle_v3i64_v4i64__7_6_0() {
 }
 
 define void @s_shuffle_v3i64_v4i64__u_1_1() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__u_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 poison, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__0_1_1() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__0_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 0, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__1_1_1() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__2_1_1() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__2_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 2, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__3_1_1() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__3_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 3, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__4_1_1() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__4_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 4, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__5_1_1() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__5_1_1:
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__u_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
 ; GFX900-NEXT:    s_mov_b32 s12, s10
 ; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -10184,17 +10077,12 @@ define void @s_shuffle_v3i64_v4i64__5_1_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__5_1_1:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__u_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
 ; GFX90A-NEXT:    s_mov_b32 s12, s10
 ; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -10202,42 +10090,30 @@ define void @s_shuffle_v3i64_v4i64__5_1_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__5_1_1:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__u_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 5, i32 1, i32 1>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 poison, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__6_1_1() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__6_1_1:
+define void @s_shuffle_v3i64_v4i64__0_1_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__0_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s16
-; GFX900-NEXT:    s_mov_b32 s9, s17
 ; GFX900-NEXT:    s_mov_b32 s12, s10
 ; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -10245,17 +10121,12 @@ define void @s_shuffle_v3i64_v4i64__6_1_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__6_1_1:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__0_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s16
-; GFX90A-NEXT:    s_mov_b32 s9, s17
 ; GFX90A-NEXT:    s_mov_b32 s12, s10
 ; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -10263,42 +10134,32 @@ define void @s_shuffle_v3i64_v4i64__6_1_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__6_1_1:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__0_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 6, i32 1, i32 1>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 0, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_1_1() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_1_1:
+define void @s_shuffle_v3i64_v4i64__1_1_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__1_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    s_mov_b32 s12, s10
 ; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -10306,17 +10167,14 @@ define void @s_shuffle_v3i64_v4i64__7_1_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_1_1:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__1_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    s_mov_b32 s12, s10
 ; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -10324,263 +10182,204 @@ define void @s_shuffle_v3i64_v4i64__7_1_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_1_1:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__1_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 1, i32 1>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_u_1() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_1:
+define void @s_shuffle_v3i64_v4i64__2_1_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__2_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_1:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__2_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_u_1:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__2_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 poison, i32 1>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 2, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_0_1() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_0_1:
+define void @s_shuffle_v3i64_v4i64__3_1_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__3_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s14
 ; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_0_1:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__3_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s14
 ; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_0_1:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__3_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 0, i32 1>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 3, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_2_1() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_2_1:
+define void @s_shuffle_v3i64_v4i64__4_1_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__4_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s16
-; GFX900-NEXT:    s_mov_b32 s11, s17
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_2_1:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__4_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s16
-; GFX90A-NEXT:    s_mov_b32 s11, s17
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_2_1:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__4_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 2, i32 1>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 4, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_3_1() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_3_1:
+define void @s_shuffle_v3i64_v4i64__5_1_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__5_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_3_1:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__5_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_3_1:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__5_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -10589,128 +10388,116 @@ define void @s_shuffle_v3i64_v4i64__7_3_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 3, i32 1>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 5, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_4_1() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_4_1:
+define void @s_shuffle_v3i64_v4i64__6_1_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__6_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s8, s16
+; GFX900-NEXT:    s_mov_b32 s9, s17
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_4_1:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__6_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s17
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_4_1:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__6_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 4, i32 1>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 6, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_5_1() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_1:
+define void @s_shuffle_v3i64_v4i64__7_1_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_1:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_5_1:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -10719,23 +10506,21 @@ define void @s_shuffle_v3i64_v4i64__7_5_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 5, i32 1>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_6_1() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_1:
+define void @s_shuffle_v3i64_v4i64__7_u_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -10746,8 +10531,6 @@ define void @s_shuffle_v3i64_v4i64__7_6_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s14
 ; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s12, s6
 ; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -10755,7 +10538,7 @@ define void @s_shuffle_v3i64_v4i64__7_6_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_1:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -10766,8 +10549,6 @@ define void @s_shuffle_v3i64_v4i64__7_6_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s14
 ; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s12, s6
 ; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -10775,190 +10556,134 @@ define void @s_shuffle_v3i64_v4i64__7_6_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_6_1:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_u_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 6, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__u_2_2() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__u_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 poison, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__0_2_2() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__0_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 0, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__1_2_2() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__1_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 1, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__2_2_2() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 2, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__3_2_2() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__3_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 3, i32 2, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 poison, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__4_2_2() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__4_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+define void @s_shuffle_v3i64_v4i64__7_0_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_0_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_0_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_0_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 4, i32 2, i32 2>
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 0, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__5_2_2() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__5_2_2:
+define void @s_shuffle_v3i64_v4i64__7_2_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_2_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s16
+; GFX900-NEXT:    s_mov_b32 s11, s17
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__5_2_2:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_2_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__5_2_2:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_2_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -10967,55 +10692,58 @@ define void @s_shuffle_v3i64_v4i64__5_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 5, i32 2, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 2, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__6_2_2() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__6_2_2:
+define void @s_shuffle_v3i64_v4i64__7_3_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_3_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__6_2_2:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_3_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__6_2_2:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_3_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -11024,238 +10752,1418 @@ define void @s_shuffle_v3i64_v4i64__6_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 6, i32 2, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 3, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_2_2() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_2_2:
+define void @s_shuffle_v3i64_v4i64__7_4_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_4_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_2_2:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_4_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_4_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 4, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__7_5_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_5_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 5, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__7_6_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_6_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 6, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__u_2_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__u_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__u_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__u_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 poison, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__0_2_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__0_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__0_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__0_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 0, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__1_2_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__1_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__1_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__1_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 1, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__2_2_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__2_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__2_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__2_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 2, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__3_2_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__3_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__3_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__3_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 3, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__4_2_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__4_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__4_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__4_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 4, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__5_2_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__5_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__5_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__5_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 5, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__6_2_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__6_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__6_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__6_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 6, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__7_2_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__7_u_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_u_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 poison, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__7_0_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_0_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_0_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_0_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 0, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__7_1_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_1_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_1_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_1_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 1, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__7_3_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_3_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_3_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_3_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 3, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__7_4_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_4_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_4_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_4_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 4, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__7_5_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_5_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 5, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__7_6_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s10, s20
+; GFX900-NEXT:    s_mov_b32 s11, s21
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s10, s20
+; GFX90A-NEXT:    s_mov_b32 s11, s21
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_6_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 6, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__u_3_3() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__u_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__u_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__u_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 poison, i32 3, i32 3>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__0_3_3() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__0_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__0_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__0_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 0, i32 3, i32 3>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__1_3_3() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__1_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__1_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__1_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 1, i32 3, i32 3>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__2_3_3() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__2_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__2_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_2_2:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__2_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 2, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 2, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_u_2() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_2:
+define void @s_shuffle_v3i64_v4i64__3_3_3() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_2:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_u_2:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 poison, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_0_2() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_0_2:
+define void @s_shuffle_v3i64_v4i64__4_3_3() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__4_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_0_2:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__4_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_0_2:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__4_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 0, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 4, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_1_2() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_1_2:
+define void @s_shuffle_v3i64_v4i64__5_3_3() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__5_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_1_2:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__5_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_1_2:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__5_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -11264,21 +12172,22 @@ define void @s_shuffle_v3i64_v4i64__7_1_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 1, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 5, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_3_2() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_3_2:
+define void @s_shuffle_v3i64_v4i64__6_3_3() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__6_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -11287,16 +12196,16 @@ define void @s_shuffle_v3i64_v4i64__7_3_2() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    s_mov_b32 s10, s14
 ; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_3_2:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__6_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -11305,16 +12214,16 @@ define void @s_shuffle_v3i64_v4i64__7_3_2() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    s_mov_b32 s10, s14
 ; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_3_2:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__6_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -11323,23 +12232,22 @@ define void @s_shuffle_v3i64_v4i64__7_3_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 3, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 6, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_4_2() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_4_2:
+define void @s_shuffle_v3i64_v4i64__7_3_3() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -11350,14 +12258,16 @@ define void @s_shuffle_v3i64_v4i64__7_4_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_4_2:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -11368,14 +12278,16 @@ define void @s_shuffle_v3i64_v4i64__7_4_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_4_2:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -11384,59 +12296,58 @@ define void @s_shuffle_v3i64_v4i64__7_4_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 4, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_5_2() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_2:
+define void @s_shuffle_v3i64_v4i64__7_u_3() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_2:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_5_2:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_u_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -11445,59 +12356,61 @@ define void @s_shuffle_v3i64_v4i64__7_5_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 5, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 poison, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_6_2() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_2:
+define void @s_shuffle_v3i64_v4i64__7_0_3() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_0_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s10, s20
-; GFX900-NEXT:    s_mov_b32 s11, s21
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s18
+; GFX900-NEXT:    s_mov_b32 s13, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_2:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_0_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s10, s20
-; GFX90A-NEXT:    s_mov_b32 s11, s21
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s18
+; GFX90A-NEXT:    s_mov_b32 s13, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_6_2:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_0_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -11506,30 +12419,32 @@ define void @s_shuffle_v3i64_v4i64__7_6_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 6, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 0, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__u_3_3() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__u_3_3:
+define void @s_shuffle_v3i64_v4i64__7_1_3() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_1_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -11537,14 +12452,17 @@ define void @s_shuffle_v3i64_v4i64__u_3_3() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__u_3_3:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_1_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -11552,58 +12470,42 @@ define void @s_shuffle_v3i64_v4i64__u_3_3() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__u_3_3:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_1_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 poison, i32 3, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__0_3_3() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__0_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 0, i32 3, i32 3>
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 1, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__1_3_3() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__1_3_3:
+define void @s_shuffle_v3i64_v4i64__7_2_3() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_2_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -11611,16 +12513,19 @@ define void @s_shuffle_v3i64_v4i64__1_3_3() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__1_3_3:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_2_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -11628,39 +12533,43 @@ define void @s_shuffle_v3i64_v4i64__1_3_3() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__1_3_3:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_2_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 1, i32 3, i32 3>
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 2, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__2_3_3() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__2_3_3:
+define void @s_shuffle_v3i64_v4i64__7_4_3() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_4_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s12
-; GFX900-NEXT:    s_mov_b32 s9, s13
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -11668,16 +12577,19 @@ define void @s_shuffle_v3i64_v4i64__2_3_3() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__2_3_3:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_4_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s12
-; GFX90A-NEXT:    s_mov_b32 s9, s13
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -11685,94 +12597,102 @@ define void @s_shuffle_v3i64_v4i64__2_3_3() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__2_3_3:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_4_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 2, i32 3, i32 3>
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 4, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__3_3_3() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__3_3_3:
+define void @s_shuffle_v3i64_v4i64__7_5_3() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s14
 ; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s12, s18
+; GFX900-NEXT:    s_mov_b32 s13, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__3_3_3:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s14
 ; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s18
+; GFX90A-NEXT:    s_mov_b32 s13, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__3_3_3:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_5_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 3, i32 3, i32 3>
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 5, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__4_3_3() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__4_3_3:
+define void @s_shuffle_v3i64_v4i64__7_6_3() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s10, s20
+; GFX900-NEXT:    s_mov_b32 s11, s21
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -11780,14 +12700,19 @@ define void @s_shuffle_v3i64_v4i64__4_3_3() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__4_3_3:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s10, s20
+; GFX90A-NEXT:    s_mov_b32 s11, s21
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -11795,649 +12720,531 @@ define void @s_shuffle_v3i64_v4i64__4_3_3() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__4_3_3:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_6_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 4, i32 3, i32 3>
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 6, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__5_3_3() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__5_3_3:
+define void @s_shuffle_v3i64_v4i64__u_4_4() {
+; GFX9-LABEL: s_shuffle_v3i64_v4i64__u_4_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use s[8:13]
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 poison, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__0_4_4() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__0_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__5_3_3:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__0_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__5_3_3:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__0_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 5, i32 3, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 0, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__6_3_3() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__6_3_3:
+define void @s_shuffle_v3i64_v4i64__1_4_4() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__1_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__6_3_3:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__1_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__6_3_3:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__1_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 6, i32 3, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 1, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_3_3() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_3_3:
+define void @s_shuffle_v3i64_v4i64__2_4_4() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__2_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_3_3:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__2_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_3_3:
-; GFX942:       ; %bb.0:
-; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__2_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 3, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 2, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_u_3() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_3:
+define void @s_shuffle_v3i64_v4i64__3_4_4() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__3_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s12, s10
-; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_3:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__3_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s10
-; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_u_3:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__3_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 poison, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 3, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_0_3() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_0_3:
+define void @s_shuffle_v3i64_v4i64__4_4_4() {
+; GFX9-LABEL: s_shuffle_v3i64_v4i64__4_4_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use s[8:13]
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 4, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__5_4_4() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__5_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s18
-; GFX900-NEXT:    s_mov_b32 s13, s19
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_0_3:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__5_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s18
-; GFX90A-NEXT:    s_mov_b32 s13, s19
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_0_3:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__5_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 0, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 5, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_1_3() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_1_3:
+define void @s_shuffle_v3i64_v4i64__6_4_4() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__6_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_1_3:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__6_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_1_3:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__6_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 1, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 6, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_2_3() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_2_3:
+define void @s_shuffle_v3i64_v4i64__7_4_4() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_2_3:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_2_3:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 2, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_4_3() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_4_3:
+define void @s_shuffle_v3i64_v4i64__7_u_4() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_4_3:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_4_3:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_u_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 4, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 poison, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_5_3() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_3:
+define void @s_shuffle_v3i64_v4i64__7_0_4() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_0_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s12, s18
-; GFX900-NEXT:    s_mov_b32 s13, s19
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_3:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_0_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s18
-; GFX90A-NEXT:    s_mov_b32 s13, s19
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_5_3:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_0_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 5, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 0, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_6_3() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_3:
+define void @s_shuffle_v3i64_v4i64__7_1_4() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_1_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s10, s20
-; GFX900-NEXT:    s_mov_b32 s11, s21
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_3:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_1_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s10, s20
-; GFX90A-NEXT:    s_mov_b32 s11, s21
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_6_3:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_1_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -12446,831 +13253,783 @@ define void @s_shuffle_v3i64_v4i64__7_6_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 6, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__u_4_4() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__u_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 poison, i32 4, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 1, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__0_4_4() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__0_4_4:
+define void @s_shuffle_v3i64_v4i64__7_2_4() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_2_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__0_4_4:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_2_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__0_4_4:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_2_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 0, i32 4, i32 4>
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 2, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__1_4_4() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__1_4_4:
+define void @s_shuffle_v3i64_v4i64__7_3_4() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_3_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__1_4_4:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_3_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__1_4_4:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_3_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 1, i32 4, i32 4>
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 3, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__2_4_4() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__2_4_4:
+define void @s_shuffle_v3i64_v4i64__7_5_4() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s6
+; GFX900-NEXT:    s_mov_b32 s11, s7
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__2_4_4:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s6
+; GFX90A-NEXT:    s_mov_b32 s11, s7
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__2_4_4:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_5_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 2, i32 4, i32 4>
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 5, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__3_4_4() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__3_4_4:
+define void @s_shuffle_v3i64_v4i64__7_6_4() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s10, s16
+; GFX900-NEXT:    s_mov_b32 s11, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__3_4_4:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s11, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__3_4_4:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_6_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 3, i32 4, i32 4>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__4_4_4() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__4_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 4, i32 4, i32 4>
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 6, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__5_4_4() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__5_4_4:
+define void @s_shuffle_v3i64_v4i64__u_5_5() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__u_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__5_4_4:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__u_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__5_4_4:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__u_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 5, i32 4, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 poison, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__6_4_4() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__6_4_4:
+define void @s_shuffle_v3i64_v4i64__0_5_5() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__0_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__6_4_4:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__0_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__6_4_4:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__0_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 6, i32 4, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 0, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_4_4() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_4_4:
+define void @s_shuffle_v3i64_v4i64__1_5_5() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__1_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_4_4:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__1_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_4_4:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__1_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 4, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 1, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_u_4() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_4:
+define void @s_shuffle_v3i64_v4i64__2_5_5() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__2_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s16
+; GFX900-NEXT:    s_mov_b32 s9, s17
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_4:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__2_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s17
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_u_4:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__2_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 poison, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 2, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_0_4() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_0_4:
+define void @s_shuffle_v3i64_v4i64__3_5_5() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__3_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_0_4:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__3_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_0_4:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__3_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 0, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 3, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_1_4() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_1_4:
+define void @s_shuffle_v3i64_v4i64__4_5_5() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__4_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_1_4:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__4_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_1_4:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__4_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 1, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 4, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_2_4() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_2_4:
+define void @s_shuffle_v3i64_v4i64__5_5_5() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__5_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_2_4:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__5_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_2_4:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__5_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 2, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 5, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_3_4() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_3_4:
+define void @s_shuffle_v3i64_v4i64__6_5_5() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__6_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_3_4:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__6_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_3_4:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__6_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 3, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 6, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_5_4() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_4:
+define void @s_shuffle_v3i64_v4i64__7_5_5() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s6
-; GFX900-NEXT:    s_mov_b32 s11, s7
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_4:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s6
-; GFX90A-NEXT:    s_mov_b32 s11, s7
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_5_4:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 5, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_6_4() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_4:
+define void @s_shuffle_v3i64_v4i64__7_u_5() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s16
-; GFX900-NEXT:    s_mov_b32 s11, s17
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_4:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s16
-; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_6_4:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_u_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 6, i32 4>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__u_5_5() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__u_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 poison, i32 5, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 poison, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__0_5_5() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__0_5_5:
+define void @s_shuffle_v3i64_v4i64__7_0_5() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_0_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -13278,17 +14037,19 @@ define void @s_shuffle_v3i64_v4i64__0_5_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__0_5_5:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_0_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -13296,68 +14057,68 @@ define void @s_shuffle_v3i64_v4i64__0_5_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__0_5_5:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_0_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 0, i32 5, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 0, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__1_5_5() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__1_5_5:
+define void @s_shuffle_v3i64_v4i64__7_1_5() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_1_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s12, s10
-; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__1_5_5:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_1_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s12, s10
-; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__1_5_5:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_1_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -13366,231 +14127,145 @@ define void @s_shuffle_v3i64_v4i64__1_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 1, i32 5, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 1, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__2_5_5() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__2_5_5:
+define void @s_shuffle_v3i64_v4i64__7_2_5() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_2_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s16
-; GFX900-NEXT:    s_mov_b32 s9, s17
-; GFX900-NEXT:    s_mov_b32 s12, s10
-; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__2_5_5:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_2_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s16
-; GFX90A-NEXT:    s_mov_b32 s9, s17
-; GFX90A-NEXT:    s_mov_b32 s12, s10
-; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__2_5_5:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_2_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[12:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 2, i32 5, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 2, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__3_5_5() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__3_5_5:
+define void @s_shuffle_v3i64_v4i64__7_3_5() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_3_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s12, s10
-; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__3_5_5:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_3_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s10
-; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__3_5_5:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_3_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[12:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 3, i32 5, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__4_5_5() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__4_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 4, i32 5, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__5_5_5() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 5, i32 5, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__6_5_5() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__6_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 6, i32 5, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__7_5_5() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__7_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 5, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 3, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_u_5() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_5:
+define void @s_shuffle_v3i64_v4i64__7_4_5() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_4_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -13598,6 +14273,8 @@ define void @s_shuffle_v3i64_v4i64__7_u_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
 ; GFX900-NEXT:    s_mov_b32 s12, s6
 ; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -13605,7 +14282,7 @@ define void @s_shuffle_v3i64_v4i64__7_u_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_5:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_4_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -13613,6 +14290,8 @@ define void @s_shuffle_v3i64_v4i64__7_u_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
 ; GFX90A-NEXT:    s_mov_b32 s12, s6
 ; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -13620,41 +14299,37 @@ define void @s_shuffle_v3i64_v4i64__7_u_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_u_5:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_4_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 poison, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 4, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_0_5() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_0_5:
+define void @s_shuffle_v3i64_v4i64__7_6_5() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s10, s16
+; GFX900-NEXT:    s_mov_b32 s11, s17
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -13662,19 +14337,16 @@ define void @s_shuffle_v3i64_v4i64__7_0_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_0_5:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s11, s17
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -13682,35 +14354,73 @@ define void @s_shuffle_v3i64_v4i64__7_0_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_0_5:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_6_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 6, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__u_6_6() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__u_6_6:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__u_6_6:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__u_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 0, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 poison, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_1_5() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_1_5:
+define void @s_shuffle_v3i64_v4i64__0_6_6() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__0_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -13719,16 +14429,16 @@ define void @s_shuffle_v3i64_v4i64__7_1_5() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s10, s16
+; GFX900-NEXT:    s_mov_b32 s11, s17
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_1_5:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__0_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -13737,16 +14447,16 @@ define void @s_shuffle_v3i64_v4i64__7_1_5() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_1_5:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__0_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -13755,360 +14465,300 @@ define void @s_shuffle_v3i64_v4i64__7_1_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 1, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 0, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_2_5() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_2_5:
+define void @s_shuffle_v3i64_v4i64__1_6_6() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__1_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_2_5:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__1_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_2_5:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__1_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 2, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 1, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_3_5() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_3_5:
+define void @s_shuffle_v3i64_v4i64__2_6_6() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__2_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s8, s16
+; GFX900-NEXT:    s_mov_b32 s9, s17
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_3_5:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__2_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_3_5:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__2_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 3, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 2, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_4_5() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_4_5:
+define void @s_shuffle_v3i64_v4i64__3_6_6() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__3_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_4_5:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__3_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_4_5:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__3_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 4, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 3, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_6_5() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_5:
+define void @s_shuffle_v3i64_v4i64__4_6_6() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__4_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s16
-; GFX900-NEXT:    s_mov_b32 s11, s17
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_5:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__4_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s16
-; GFX90A-NEXT:    s_mov_b32 s11, s17
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_6_5:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__4_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 6, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__u_6_6() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__u_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 poison, i32 6, i32 6>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 4, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__0_6_6() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__0_6_6:
+define void @s_shuffle_v3i64_v4i64__5_6_6() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__5_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s16
-; GFX900-NEXT:    s_mov_b32 s11, s17
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__0_6_6:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__5_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s16
-; GFX90A-NEXT:    s_mov_b32 s11, s17
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__0_6_6:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__5_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 0, i32 6, i32 6>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 5, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__1_6_6() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__1_6_6:
+define void @s_shuffle_v3i64_v4i64__6_6_6() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -14116,17 +14766,14 @@ define void @s_shuffle_v3i64_v4i64__1_6_6() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__1_6_6:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -14134,42 +14781,34 @@ define void @s_shuffle_v3i64_v4i64__1_6_6() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__1_6_6:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 1, i32 6, i32 6>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 6, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__2_6_6() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__2_6_6:
+define void @s_shuffle_v3i64_v4i64__7_6_6() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_6:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s16
-; GFX900-NEXT:    s_mov_b32 s9, s17
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -14177,17 +14816,14 @@ define void @s_shuffle_v3i64_v4i64__2_6_6() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__2_6_6:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s16
-; GFX90A-NEXT:    s_mov_b32 s9, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -14195,190 +14831,63 @@ define void @s_shuffle_v3i64_v4i64__2_6_6() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__2_6_6:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 2, i32 6, i32 6>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__3_6_6() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__3_6_6:
+define void @s_shuffle_v3i64_v4i64__7_u_6() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__3_6_6:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__3_6_6:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_u_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 3, i32 6, i32 6>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__4_6_6() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__4_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 4, i32 6, i32 6>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__5_6_6() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__5_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 5, i32 6, i32 6>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__6_6_6() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__6_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 6, i32 6, i32 6>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__7_6_6() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__7_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 6, i32 6>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__7_u_6() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__7_u_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 poison, i32 6>
@@ -14432,10 +14941,8 @@ define void @s_shuffle_v3i64_v4i64__7_0_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14493,10 +15000,8 @@ define void @s_shuffle_v3i64_v4i64__7_1_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14554,10 +15059,8 @@ define void @s_shuffle_v3i64_v4i64__7_2_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14615,10 +15118,8 @@ define void @s_shuffle_v3i64_v4i64__7_3_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14671,12 +15172,9 @@ define void @s_shuffle_v3i64_v4i64__7_4_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14689,18 +15187,43 @@ define void @s_shuffle_v3i64_v4i64__7_4_6() {
 }
 
 define void @s_shuffle_v3i64_v4i64__7_5_6() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__7_5_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_6:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_6:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_5_6:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 5, i32 6>
@@ -14745,10 +15268,8 @@ define void @s_shuffle_v3i64_v4i64__u_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14806,10 +15327,8 @@ define void @s_shuffle_v3i64_v4i64__0_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14871,12 +15390,9 @@ define void @s_shuffle_v3i64_v4i64__1_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14934,12 +15450,9 @@ define void @s_shuffle_v3i64_v4i64__2_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15001,12 +15514,9 @@ define void @s_shuffle_v3i64_v4i64__3_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15019,20 +15529,48 @@ define void @s_shuffle_v3i64_v4i64__3_7_7() {
 }
 
 define void @s_shuffle_v3i64_v4i64__4_7_7() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__4_7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__4_7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__4_7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__4_7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 4, i32 7, i32 7>
@@ -15081,12 +15619,9 @@ define void @s_shuffle_v3i64_v4i64__5_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15139,12 +15674,9 @@ define void @s_shuffle_v3i64_v4i64__6_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15193,10 +15725,8 @@ define void @s_shuffle_v3i64_v4i64__7_u_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15258,12 +15788,9 @@ define void @s_shuffle_v3i64_v4i64__7_0_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15321,10 +15848,8 @@ define void @s_shuffle_v3i64_v4i64__7_1_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15386,12 +15911,9 @@ define void @s_shuffle_v3i64_v4i64__7_2_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15449,12 +15971,9 @@ define void @s_shuffle_v3i64_v4i64__7_3_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15507,12 +16026,9 @@ define void @s_shuffle_v3i64_v4i64__7_4_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15525,20 +16041,48 @@ define void @s_shuffle_v3i64_v4i64__7_4_7() {
 }
 
 define void @s_shuffle_v3i64_v4i64__7_5_7() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__7_5_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_5_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 5, i32 7>
@@ -15587,12 +16131,9 @@ define void @s_shuffle_v3i64_v4i64__7_6_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll
index fe132493ce536..9d557a317986d 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll
@@ -45,6 +45,7 @@ define void @v_shuffle_v3p0_v2p0__0_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -84,10 +85,11 @@ define void @v_shuffle_v3p0_v2p0__1_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3p0_v2p0__1_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
@@ -140,10 +142,11 @@ define void @v_shuffle_v3p0_v2p0__3_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
@@ -203,7 +206,7 @@ define void @v_shuffle_v3p0_v2p0__3_0_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[2:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
@@ -254,14 +257,14 @@ define void @v_shuffle_v3p0_v2p0__3_1_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_1_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -308,7 +311,7 @@ define void @v_shuffle_v3p0_v2p0__3_2_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
@@ -351,10 +354,11 @@ define void @v_shuffle_v3p0_v2p0__3_3_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_3_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
@@ -1472,6 +1476,7 @@ define void @v_shuffle_v3p0_v2p0__0_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1511,10 +1516,11 @@ define void @v_shuffle_v3p0_v2p0__1_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3p0_v2p0__1_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
@@ -2266,8 +2272,7 @@ define void @s_shuffle_v3p0_v2p0__1_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2325,8 +2330,7 @@ define void @s_shuffle_v3p0_v2p0__3_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2384,10 +2388,8 @@ define void @s_shuffle_v3p0_v2p0__3_0_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2441,8 +2443,7 @@ define void @s_shuffle_v3p0_v2p0__3_1_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2491,10 +2492,8 @@ define void @s_shuffle_v3p0_v2p0__3_2_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2507,18 +2506,43 @@ define void @s_shuffle_v3p0_v2p0__3_2_u() {
 }
 
 define void @s_shuffle_v3p0_v2p0__3_3_u() {
-; GFX9-LABEL: s_shuffle_v3p0_v2p0__3_3_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v2p0__3_3_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v2p0__3_3_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v2p0__3_3_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> <i32 3, i32 3, i32 poison>
@@ -2572,10 +2596,8 @@ define void @s_shuffle_v3p0_v2p0__3_3_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2633,10 +2655,8 @@ define void @s_shuffle_v3p0_v2p0__3_3_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2689,12 +2709,9 @@ define void @s_shuffle_v3p0_v2p0__3_3_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2707,20 +2724,48 @@ define void @s_shuffle_v3p0_v2p0__3_3_2() {
 }
 
 define void @s_shuffle_v3p0_v2p0__3_3_3() {
-; GFX9-LABEL: s_shuffle_v3p0_v2p0__3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v2p0__3_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v2p0__3_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v2p0__3_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> <i32 3, i32 3, i32 3>
@@ -2765,10 +2810,8 @@ define void @s_shuffle_v3p0_v2p0__u_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2780,20 +2823,48 @@ define void @s_shuffle_v3p0_v2p0__u_0_0() {
 }
 
 define void @s_shuffle_v3p0_v2p0__0_0_0() {
-; GFX9-LABEL: s_shuffle_v3p0_v2p0__0_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_mov_b32 s13, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v2p0__0_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    s_mov_b32 s12, s8
+; GFX900-NEXT:    s_mov_b32 s13, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v2p0__0_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    s_mov_b32 s12, s8
+; GFX90A-NEXT:    s_mov_b32 s13, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v2p0__0_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> zeroinitializer
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
@@ -2841,12 +2912,9 @@ define void @s_shuffle_v3p0_v2p0__1_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2894,10 +2962,8 @@ define void @s_shuffle_v3p0_v2p0__2_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2958,12 +3024,9 @@ define void @s_shuffle_v3p0_v2p0__3_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3021,10 +3084,8 @@ define void @s_shuffle_v3p0_v2p0__3_u_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3086,12 +3147,9 @@ define void @s_shuffle_v3p0_v2p0__3_1_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3153,12 +3211,9 @@ define void @s_shuffle_v3p0_v2p0__3_2_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3171,18 +3226,43 @@ define void @s_shuffle_v3p0_v2p0__3_2_0() {
 }
 
 define void @s_shuffle_v3p0_v2p0__u_1_1() {
-; GFX9-LABEL: s_shuffle_v3p0_v2p0__u_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v2p0__u_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v2p0__u_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v2p0__u_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> <i32 poison, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
@@ -3190,18 +3270,43 @@ define void @s_shuffle_v3p0_v2p0__u_1_1() {
 }
 
 define void @s_shuffle_v3p0_v2p0__0_1_1() {
-; GFX9-LABEL: s_shuffle_v3p0_v2p0__0_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v2p0__0_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v2p0__0_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v2p0__0_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> <i32 0, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
@@ -3209,20 +3314,48 @@ define void @s_shuffle_v3p0_v2p0__0_1_1() {
 }
 
 define void @s_shuffle_v3p0_v2p0__1_1_1() {
-; GFX9-LABEL: s_shuffle_v3p0_v2p0__1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v2p0__1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v2p0__1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v2p0__1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> <i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
@@ -3230,18 +3363,43 @@ define void @s_shuffle_v3p0_v2p0__1_1_1() {
 }
 
 define void @s_shuffle_v3p0_v2p0__2_1_1() {
-; GFX9-LABEL: s_shuffle_v3p0_v2p0__2_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v2p0__2_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v2p0__2_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v2p0__2_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> <i32 2, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
@@ -3294,10 +3452,8 @@ define void @s_shuffle_v3p0_v2p0__3_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3355,10 +3511,8 @@ define void @s_shuffle_v3p0_v2p0__3_u_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3420,12 +3574,9 @@ define void @s_shuffle_v3p0_v2p0__3_0_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3487,12 +3638,9 @@ define void @s_shuffle_v3p0_v2p0__3_2_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3591,8 +3739,7 @@ define void @s_shuffle_v3p0_v2p0__1_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3658,12 +3805,9 @@ define void @s_shuffle_v3p0_v2p0__3_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3712,10 +3856,8 @@ define void @s_shuffle_v3p0_v2p0__3_u_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3773,12 +3915,9 @@ define void @s_shuffle_v3p0_v2p0__3_0_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3836,10 +3975,8 @@ define void @s_shuffle_v3p0_v2p0__3_1_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3852,18 +3989,43 @@ define void @s_shuffle_v3p0_v2p0__3_1_2() {
 }
 
 define void @s_shuffle_v3p0_v2p0__u_3_3() {
-; GFX9-LABEL: s_shuffle_v3p0_v2p0__u_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v2p0__u_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v2p0__u_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v2p0__u_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> <i32 poison, i32 3, i32 3>
@@ -3917,10 +4079,8 @@ define void @s_shuffle_v3p0_v2p0__0_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3978,10 +4138,8 @@ define void @s_shuffle_v3p0_v2p0__1_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3994,18 +4152,43 @@ define void @s_shuffle_v3p0_v2p0__1_3_3() {
 }
 
 define void @s_shuffle_v3p0_v2p0__2_3_3() {
-; GFX9-LABEL: s_shuffle_v3p0_v2p0__2_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v2p0__2_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v2p0__2_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v2p0__2_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> <i32 2, i32 3, i32 3>
@@ -4050,10 +4233,8 @@ define void @s_shuffle_v3p0_v2p0__3_u_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4115,12 +4296,9 @@ define void @s_shuffle_v3p0_v2p0__3_0_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4178,10 +4356,8 @@ define void @s_shuffle_v3p0_v2p0__3_1_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4234,12 +4410,9 @@ define void @s_shuffle_v3p0_v2p0__3_2_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v3p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v3p0.ll
index b6f4e3091b61f..84cc0535469f3 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v3p0.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v3p0.ll
@@ -87,6 +87,7 @@ define void @v_shuffle_v3p0_v3p0__1_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -126,10 +127,11 @@ define void @v_shuffle_v3p0_v3p0__2_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3p0_v3p0__2_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
@@ -182,6 +184,7 @@ define void @v_shuffle_v3p0_v3p0__4_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -222,10 +225,11 @@ define void @v_shuffle_v3p0_v3p0__5_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
@@ -285,7 +289,7 @@ define void @v_shuffle_v3p0_v3p0__5_0_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[2:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
@@ -336,10 +340,11 @@ define void @v_shuffle_v3p0_v3p0__5_1_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_1_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v10, 0
+; GFX942-NEXT:    global_store_dwordx2 v10, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -399,7 +404,7 @@ define void @v_shuffle_v3p0_v3p0__5_2_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[6:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    global_store_dwordx2 v12, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v10
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v11
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[2:5], s[0:1]
@@ -454,6 +459,7 @@ define void @v_shuffle_v3p0_v3p0__5_3_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -494,10 +500,11 @@ define void @v_shuffle_v3p0_v3p0__5_4_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_4_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
@@ -546,6 +553,7 @@ define void @v_shuffle_v3p0_v3p0__5_5_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2902,6 +2910,7 @@ define void @v_shuffle_v3p0_v3p0__1_3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2941,10 +2950,11 @@ define void @v_shuffle_v3p0_v3p0__2_3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3p0_v3p0__2_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
@@ -4720,8 +4730,7 @@ define void @s_shuffle_v3p0_v3p0__1_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4761,8 +4770,7 @@ define void @s_shuffle_v3p0_v3p0__2_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4820,8 +4828,7 @@ define void @s_shuffle_v3p0_v3p0__4_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4862,8 +4869,7 @@ define void @s_shuffle_v3p0_v3p0__5_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4918,11 +4924,11 @@ define void @s_shuffle_v3p0_v3p0__5_0_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4972,8 +4978,7 @@ define void @s_shuffle_v3p0_v3p0__5_1_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5027,10 +5032,8 @@ define void @s_shuffle_v3p0_v3p0__5_2_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5075,10 +5078,8 @@ define void @s_shuffle_v3p0_v3p0__5_3_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5091,18 +5092,43 @@ define void @s_shuffle_v3p0_v3p0__5_3_u() {
 }
 
 define void @s_shuffle_v3p0_v3p0__5_4_u() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__5_4_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_4_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_4_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_4_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> <i32 5, i32 4, i32 poison>
@@ -5147,10 +5173,8 @@ define void @s_shuffle_v3p0_v3p0__5_5_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5212,12 +5236,9 @@ define void @s_shuffle_v3p0_v3p0__5_5_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5279,12 +5300,9 @@ define void @s_shuffle_v3p0_v3p0__5_5_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5342,10 +5360,8 @@ define void @s_shuffle_v3p0_v3p0__5_5_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5394,12 +5410,9 @@ define void @s_shuffle_v3p0_v3p0__5_5_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5452,12 +5465,9 @@ define void @s_shuffle_v3p0_v3p0__5_5_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5470,20 +5480,48 @@ define void @s_shuffle_v3p0_v3p0__5_5_4() {
 }
 
 define void @s_shuffle_v3p0_v3p0__5_5_5() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> <i32 5, i32 5, i32 5>
@@ -5528,10 +5566,8 @@ define void @s_shuffle_v3p0_v3p0__u_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5543,20 +5579,48 @@ define void @s_shuffle_v3p0_v3p0__u_0_0() {
 }
 
 define void @s_shuffle_v3p0_v3p0__0_0_0() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__0_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_mov_b32 s13, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__0_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    s_mov_b32 s12, s8
+; GFX900-NEXT:    s_mov_b32 s13, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__0_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    s_mov_b32 s12, s8
+; GFX90A-NEXT:    s_mov_b32 s13, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__0_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> zeroinitializer
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
@@ -5604,12 +5668,9 @@ define void @s_shuffle_v3p0_v3p0__1_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5657,12 +5718,9 @@ define void @s_shuffle_v3p0_v3p0__2_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5710,10 +5768,8 @@ define void @s_shuffle_v3p0_v3p0__3_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5771,15 +5827,12 @@ define void @s_shuffle_v3p0_v3p0__4_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5838,13 +5891,11 @@ define void @s_shuffle_v3p0_v3p0__5_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5899,11 +5950,11 @@ define void @s_shuffle_v3p0_v3p0__5_u_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5962,13 +6013,11 @@ define void @s_shuffle_v3p0_v3p0__5_1_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6022,12 +6071,9 @@ define void @s_shuffle_v3p0_v3p0__5_2_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6086,13 +6132,11 @@ define void @s_shuffle_v3p0_v3p0__5_3_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6150,10 +6194,8 @@ define void @s_shuffle_v3p0_v3p0__5_4_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6166,116 +6208,12 @@ define void @s_shuffle_v3p0_v3p0__5_4_0() {
 }
 
 define void @s_shuffle_v3p0_v3p0__u_1_1() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__u_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> <i32 poison, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v3p0__0_1_1() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__0_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> <i32 0, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v3p0__1_1_1() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> <i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v3p0__2_1_1() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__2_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> <i32 2, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v3p0__3_1_1() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__3_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> <i32 3, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v3p0__4_1_1() {
-; GFX900-LABEL: s_shuffle_v3p0_v3p0__4_1_1:
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__u_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:9]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
 ; GFX900-NEXT:    s_mov_b32 s12, s10
 ; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -6283,17 +6221,12 @@ define void @s_shuffle_v3p0_v3p0__4_1_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v3p0__4_1_1:
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__u_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:9]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
 ; GFX90A-NEXT:    s_mov_b32 s12, s10
 ; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -6301,40 +6234,30 @@ define void @s_shuffle_v3p0_v3p0__4_1_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v3p0__4_1_1:
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__u_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:5]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <3 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> <i32 4, i32 1, i32 1>
+  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> <i32 poison, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v3p0__5_1_1() {
-; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_1_1:
+define void @s_shuffle_v3p0_v3p0__0_1_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__0_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:9]
-; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s12, s10
 ; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -6342,15 +6265,12 @@ define void @s_shuffle_v3p0_v3p0__5_1_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_1_1:
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__0_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:9]
-; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s12, s10
 ; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -6358,19 +6278,268 @@ define void @s_shuffle_v3p0_v3p0__5_1_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_1_1:
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__0_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:5]
+; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> <i32 0, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v3p0__1_1_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> <i32 1, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v3p0__2_1_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__2_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__2_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__2_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> <i32 2, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v3p0__3_1_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__3_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__3_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__3_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> <i32 3, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v3p0__4_1_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__4_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:9]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__4_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:9]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__4_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:5]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <3 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> <i32 4, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v3p0__5_1_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:9]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:9]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:5]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6425,11 +6594,11 @@ define void @s_shuffle_v3p0_v3p0__5_u_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6488,13 +6657,11 @@ define void @s_shuffle_v3p0_v3p0__5_0_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6552,12 +6719,9 @@ define void @s_shuffle_v3p0_v3p0__5_2_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6577,217 +6741,344 @@ define void @s_shuffle_v3p0_v3p0__5_3_1() {
 ; GFX900-NEXT:    ; def s[4:9]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:17]
+; GFX900-NEXT:    ; def s[12:17]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s16
+; GFX900-NEXT:    s_mov_b32 s9, s17
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_3_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:9]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:17]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_3_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:5]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:9]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <3 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> <i32 5, i32 3, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v3p0__5_4_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_4_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:9]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_4_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:9]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_4_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:5]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <3 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> <i32 5, i32 4, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v3p0__u_2_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__u_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__u_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__u_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> <i32 poison, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v3p0__0_2_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__0_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__0_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__0_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> <i32 0, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v3p0__1_2_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__1_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__1_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__1_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> <i32 1, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v3p0__2_2_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__2_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s16
-; GFX900-NEXT:    s_mov_b32 s9, s17
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_3_1:
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:9]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:17]
+; GFX90A-NEXT:    ; def s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s16
-; GFX90A-NEXT:    s_mov_b32 s9, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_3_1:
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:5]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:9]
+; GFX942-NEXT:    ; def s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <3 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> <i32 5, i32 3, i32 1>
+  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> <i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v3p0__5_4_1() {
-; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_4_1:
+define void @s_shuffle_v3p0_v3p0__3_2_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__3_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:9]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s12
-; GFX900-NEXT:    s_mov_b32 s9, s13
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_4_1:
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__3_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:9]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s12
-; GFX90A-NEXT:    s_mov_b32 s9, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_4_1:
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__3_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:5]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <3 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> <i32 5, i32 4, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v3p0__u_2_2() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__u_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> <i32 poison, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v3p0__0_2_2() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__0_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> <i32 0, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v3p0__1_2_2() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__1_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> <i32 1, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v3p0__2_2_2() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> <i32 2, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v3p0__3_2_2() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__3_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> <i32 3, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
@@ -6840,10 +7131,8 @@ define void @s_shuffle_v3p0_v3p0__4_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6897,10 +7186,8 @@ define void @s_shuffle_v3p0_v3p0__5_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6950,8 +7237,7 @@ define void @s_shuffle_v3p0_v3p0__5_u_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7009,12 +7295,9 @@ define void @s_shuffle_v3p0_v3p0__5_0_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7064,8 +7347,7 @@ define void @s_shuffle_v3p0_v3p0__5_1_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7119,10 +7401,8 @@ define void @s_shuffle_v3p0_v3p0__5_3_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7180,10 +7460,8 @@ define void @s_shuffle_v3p0_v3p0__5_4_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7282,8 +7560,7 @@ define void @s_shuffle_v3p0_v3p0__1_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7323,8 +7600,7 @@ define void @s_shuffle_v3p0_v3p0__2_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7390,12 +7666,9 @@ define void @s_shuffle_v3p0_v3p0__4_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7444,12 +7717,9 @@ define void @s_shuffle_v3p0_v3p0__5_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7494,10 +7764,8 @@ define void @s_shuffle_v3p0_v3p0__5_u_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7552,13 +7820,11 @@ define void @s_shuffle_v3p0_v3p0__5_0_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7612,10 +7878,8 @@ define void @s_shuffle_v3p0_v3p0__5_1_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7673,10 +7937,8 @@ define void @s_shuffle_v3p0_v3p0__5_2_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:17]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7725,12 +7987,9 @@ define void @s_shuffle_v3p0_v3p0__5_4_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7743,18 +8002,43 @@ define void @s_shuffle_v3p0_v3p0__5_4_3() {
 }
 
 define void @s_shuffle_v3p0_v3p0__u_4_4() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__u_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__u_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__u_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__u_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> <i32 poison, i32 4, i32 4>
@@ -7808,10 +8092,8 @@ define void @s_shuffle_v3p0_v3p0__0_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7869,10 +8151,8 @@ define void @s_shuffle_v3p0_v3p0__1_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7930,10 +8210,8 @@ define void @s_shuffle_v3p0_v3p0__2_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7946,18 +8224,43 @@ define void @s_shuffle_v3p0_v3p0__2_4_4() {
 }
 
 define void @s_shuffle_v3p0_v3p0__3_4_4() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__3_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__3_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__3_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__3_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> <i32 3, i32 4, i32 4>
@@ -7966,20 +8269,48 @@ define void @s_shuffle_v3p0_v3p0__3_4_4() {
 }
 
 define void @s_shuffle_v3p0_v3p0__4_4_4() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__4_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__4_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__4_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__4_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> <i32 4, i32 4, i32 4>
@@ -7988,20 +8319,48 @@ define void @s_shuffle_v3p0_v3p0__4_4_4() {
 }
 
 define void @s_shuffle_v3p0_v3p0__5_4_4() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__5_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> <i32 5, i32 4, i32 4>
@@ -8042,10 +8401,8 @@ define void @s_shuffle_v3p0_v3p0__5_u_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8104,13 +8461,11 @@ define void @s_shuffle_v3p0_v3p0__5_0_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8164,10 +8519,8 @@ define void @s_shuffle_v3p0_v3p0__5_1_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8225,12 +8578,9 @@ define void @s_shuffle_v3p0_v3p0__5_2_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8279,12 +8629,9 @@ define void @s_shuffle_v3p0_v3p0__5_3_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8297,18 +8644,43 @@ define void @s_shuffle_v3p0_v3p0__5_3_4() {
 }
 
 define void @s_shuffle_v3p0_v3p0__u_5_5() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__u_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__u_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__u_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__u_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> <i32 poison, i32 5, i32 5>
@@ -8362,10 +8734,8 @@ define void @s_shuffle_v3p0_v3p0__0_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8423,10 +8793,8 @@ define void @s_shuffle_v3p0_v3p0__1_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8484,10 +8852,8 @@ define void @s_shuffle_v3p0_v3p0__2_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8500,18 +8866,43 @@ define void @s_shuffle_v3p0_v3p0__2_5_5() {
 }
 
 define void @s_shuffle_v3p0_v3p0__3_5_5() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__3_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__3_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__3_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__3_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> <i32 3, i32 5, i32 5>
@@ -8520,20 +8911,48 @@ define void @s_shuffle_v3p0_v3p0__3_5_5() {
 }
 
 define void @s_shuffle_v3p0_v3p0__4_5_5() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__4_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__4_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__4_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__4_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> <i32 4, i32 5, i32 5>
@@ -8542,18 +8961,43 @@ define void @s_shuffle_v3p0_v3p0__4_5_5() {
 }
 
 define void @s_shuffle_v3p0_v3p0__5_u_5() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__5_u_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_u_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_u_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_u_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> <i32 5, i32 poison, i32 5>
@@ -8607,10 +9051,8 @@ define void @s_shuffle_v3p0_v3p0__5_0_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8668,10 +9110,8 @@ define void @s_shuffle_v3p0_v3p0__5_1_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8729,10 +9169,8 @@ define void @s_shuffle_v3p0_v3p0__5_2_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8785,12 +9223,9 @@ define void @s_shuffle_v3p0_v3p0__5_3_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8803,18 +9238,43 @@ define void @s_shuffle_v3p0_v3p0__5_3_5() {
 }
 
 define void @s_shuffle_v3p0_v3p0__5_4_5() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__5_4_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_4_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_4_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_4_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> <i32 5, i32 4, i32 5>
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll
index b03066e66cf66..3e9a69b3de166 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll
@@ -87,6 +87,7 @@ define void @v_shuffle_v3p0_v4p0__1_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -126,6 +127,7 @@ define void @v_shuffle_v3p0_v4p0__2_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -165,10 +167,11 @@ define void @v_shuffle_v3p0_v4p0__3_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3p0_v4p0__3_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -221,6 +224,7 @@ define void @v_shuffle_v3p0_v4p0__5_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -261,6 +265,7 @@ define void @v_shuffle_v3p0_v4p0__6_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -301,10 +306,11 @@ define void @v_shuffle_v3p0_v4p0__7_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -364,7 +370,7 @@ define void @v_shuffle_v3p0_v4p0__7_0_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[2:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    global_store_dwordx2 v10, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v8
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v9
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
@@ -415,10 +421,11 @@ define void @v_shuffle_v3p0_v4p0__7_1_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_1_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v12, 0
+; GFX942-NEXT:    global_store_dwordx2 v12, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -478,7 +485,7 @@ define void @v_shuffle_v3p0_v4p0__7_2_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[6:13]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    global_store_dwordx2 v14, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v12
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v13
 ; GFX942-NEXT:    global_store_dwordx4 v14, v[2:5], s[0:1]
@@ -534,7 +541,7 @@ define void @v_shuffle_v3p0_v4p0__7_3_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    global_store_dwordx2 v16, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v14
 ; GFX942-NEXT:    v_mov_b32_e32 v5, v15
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
@@ -589,6 +596,7 @@ define void @v_shuffle_v3p0_v4p0__7_4_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -629,10 +637,11 @@ define void @v_shuffle_v3p0_v4p0__7_5_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_5_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -681,6 +690,7 @@ define void @v_shuffle_v3p0_v4p0__7_6_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -727,6 +737,7 @@ define void @v_shuffle_v3p0_v4p0__7_7_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4719,6 +4730,7 @@ define void @v_shuffle_v3p0_v4p0__1_4_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4758,6 +4770,7 @@ define void @v_shuffle_v3p0_v4p0__2_4_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4797,10 +4810,11 @@ define void @v_shuffle_v3p0_v4p0__3_4_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3p0_v4p0__3_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -8016,8 +8030,7 @@ define void @s_shuffle_v3p0_v4p0__1_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8057,8 +8070,7 @@ define void @s_shuffle_v3p0_v4p0__2_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8102,8 +8114,7 @@ define void @s_shuffle_v3p0_v4p0__3_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8161,8 +8172,7 @@ define void @s_shuffle_v3p0_v4p0__5_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8203,8 +8213,7 @@ define void @s_shuffle_v3p0_v4p0__6_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8249,8 +8258,7 @@ define void @s_shuffle_v3p0_v4p0__7_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8309,10 +8317,8 @@ define void @s_shuffle_v3p0_v4p0__7_0_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8366,8 +8372,7 @@ define void @s_shuffle_v3p0_v4p0__7_1_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8425,10 +8430,8 @@ define void @s_shuffle_v3p0_v4p0__7_2_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8482,10 +8485,8 @@ define void @s_shuffle_v3p0_v4p0__7_3_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8534,10 +8535,8 @@ define void @s_shuffle_v3p0_v4p0__7_4_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8550,18 +8549,43 @@ define void @s_shuffle_v3p0_v4p0__7_4_u() {
 }
 
 define void @s_shuffle_v3p0_v4p0__7_5_u() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__7_5_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_5_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 5, i32 poison>
@@ -8606,10 +8630,8 @@ define void @s_shuffle_v3p0_v4p0__7_6_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8654,10 +8676,8 @@ define void @s_shuffle_v3p0_v4p0__7_7_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8716,13 +8736,11 @@ define void @s_shuffle_v3p0_v4p0__7_7_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8781,13 +8799,11 @@ define void @s_shuffle_v3p0_v4p0__7_7_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8841,10 +8857,8 @@ define void @s_shuffle_v3p0_v4p0__7_7_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8902,12 +8916,9 @@ define void @s_shuffle_v3p0_v4p0__7_7_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8956,12 +8967,9 @@ define void @s_shuffle_v3p0_v4p0__7_7_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9010,12 +9018,9 @@ define void @s_shuffle_v3p0_v4p0__7_7_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9028,20 +9033,48 @@ define void @s_shuffle_v3p0_v4p0__7_7_5() {
 }
 
 define void @s_shuffle_v3p0_v4p0__7_7_6() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__7_7_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_7_6:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_7_6:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_7_6:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 7, i32 6>
@@ -9090,12 +9123,9 @@ define void @s_shuffle_v3p0_v4p0__7_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9144,10 +9174,8 @@ define void @s_shuffle_v3p0_v4p0__u_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9159,20 +9187,48 @@ define void @s_shuffle_v3p0_v4p0__u_0_0() {
 }
 
 define void @s_shuffle_v3p0_v4p0__0_0_0() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__0_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_mov_b32 s13, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__0_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    s_mov_b32 s12, s8
+; GFX900-NEXT:    s_mov_b32 s13, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__0_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    s_mov_b32 s12, s8
+; GFX90A-NEXT:    s_mov_b32 s13, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__0_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> zeroinitializer
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
@@ -9220,12 +9276,9 @@ define void @s_shuffle_v3p0_v4p0__1_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9273,12 +9326,9 @@ define void @s_shuffle_v3p0_v4p0__2_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9330,12 +9380,9 @@ define void @s_shuffle_v3p0_v4p0__3_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9383,10 +9430,8 @@ define void @s_shuffle_v3p0_v4p0__4_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9444,15 +9489,12 @@ define void @s_shuffle_v3p0_v4p0__5_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9511,13 +9553,11 @@ define void @s_shuffle_v3p0_v4p0__6_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9576,15 +9616,12 @@ define void @s_shuffle_v3p0_v4p0__7_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9639,13 +9676,11 @@ define void @s_shuffle_v3p0_v4p0__7_u_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9704,15 +9739,12 @@ define void @s_shuffle_v3p0_v4p0__7_1_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9770,12 +9802,9 @@ define void @s_shuffle_v3p0_v4p0__7_2_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9833,12 +9862,9 @@ define void @s_shuffle_v3p0_v4p0__7_3_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9897,15 +9923,12 @@ define void @s_shuffle_v3p0_v4p0__7_4_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9963,10 +9986,8 @@ define void @s_shuffle_v3p0_v4p0__7_5_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10028,12 +10049,9 @@ define void @s_shuffle_v3p0_v4p0__7_6_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10046,137 +10064,12 @@ define void @s_shuffle_v3p0_v4p0__7_6_0() {
 }
 
 define void @s_shuffle_v3p0_v4p0__u_1_1() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__u_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 poison, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__0_1_1() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__0_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 0, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__1_1_1() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__2_1_1() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__2_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 2, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__3_1_1() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__3_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 3, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__4_1_1() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__4_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 4, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__5_1_1() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__5_1_1:
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__u_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
 ; GFX900-NEXT:    s_mov_b32 s12, s10
 ; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -10184,17 +10077,12 @@ define void @s_shuffle_v3p0_v4p0__5_1_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__5_1_1:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__u_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
 ; GFX90A-NEXT:    s_mov_b32 s12, s10
 ; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -10202,42 +10090,30 @@ define void @s_shuffle_v3p0_v4p0__5_1_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__5_1_1:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__u_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 5, i32 1, i32 1>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 poison, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__6_1_1() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__6_1_1:
+define void @s_shuffle_v3p0_v4p0__0_1_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__0_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s16
-; GFX900-NEXT:    s_mov_b32 s9, s17
 ; GFX900-NEXT:    s_mov_b32 s12, s10
 ; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -10245,17 +10121,12 @@ define void @s_shuffle_v3p0_v4p0__6_1_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__6_1_1:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__0_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s16
-; GFX90A-NEXT:    s_mov_b32 s9, s17
 ; GFX90A-NEXT:    s_mov_b32 s12, s10
 ; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -10263,42 +10134,32 @@ define void @s_shuffle_v3p0_v4p0__6_1_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__6_1_1:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__0_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 6, i32 1, i32 1>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 0, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_1_1() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_1_1:
+define void @s_shuffle_v3p0_v4p0__1_1_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__1_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    s_mov_b32 s12, s10
 ; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -10306,17 +10167,14 @@ define void @s_shuffle_v3p0_v4p0__7_1_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_1_1:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__1_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    s_mov_b32 s12, s10
 ; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -10324,263 +10182,204 @@ define void @s_shuffle_v3p0_v4p0__7_1_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_1_1:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__1_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 1, i32 1>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_u_1() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_1:
+define void @s_shuffle_v3p0_v4p0__2_1_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__2_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_1:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__2_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_u_1:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__2_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 poison, i32 1>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 2, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_0_1() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_0_1:
+define void @s_shuffle_v3p0_v4p0__3_1_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__3_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s14
 ; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_0_1:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__3_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s14
 ; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_0_1:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__3_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 0, i32 1>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 3, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_2_1() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_2_1:
+define void @s_shuffle_v3p0_v4p0__4_1_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__4_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s16
-; GFX900-NEXT:    s_mov_b32 s11, s17
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_2_1:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__4_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s16
-; GFX90A-NEXT:    s_mov_b32 s11, s17
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_2_1:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__4_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 2, i32 1>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 4, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_3_1() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_3_1:
+define void @s_shuffle_v3p0_v4p0__5_1_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__5_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_3_1:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__5_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_3_1:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__5_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -10589,128 +10388,116 @@ define void @s_shuffle_v3p0_v4p0__7_3_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 3, i32 1>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 5, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_4_1() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_4_1:
+define void @s_shuffle_v3p0_v4p0__6_1_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__6_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s8, s16
+; GFX900-NEXT:    s_mov_b32 s9, s17
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_4_1:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__6_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s17
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_4_1:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__6_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 4, i32 1>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 6, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_5_1() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_1:
+define void @s_shuffle_v3p0_v4p0__7_1_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_1:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_5_1:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -10719,23 +10506,21 @@ define void @s_shuffle_v3p0_v4p0__7_5_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 5, i32 1>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_6_1() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_1:
+define void @s_shuffle_v3p0_v4p0__7_u_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -10746,8 +10531,6 @@ define void @s_shuffle_v3p0_v4p0__7_6_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s14
 ; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s12, s6
 ; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -10755,7 +10538,7 @@ define void @s_shuffle_v3p0_v4p0__7_6_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_1:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -10766,8 +10549,6 @@ define void @s_shuffle_v3p0_v4p0__7_6_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s14
 ; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s12, s6
 ; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -10775,190 +10556,134 @@ define void @s_shuffle_v3p0_v4p0__7_6_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_6_1:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_u_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 6, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__u_2_2() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__u_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 poison, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__0_2_2() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__0_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 0, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__1_2_2() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__1_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 1, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__2_2_2() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 2, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__3_2_2() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__3_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 3, i32 2, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 poison, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__4_2_2() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__4_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+define void @s_shuffle_v3p0_v4p0__7_0_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_0_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_0_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_0_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 4, i32 2, i32 2>
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 0, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__5_2_2() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__5_2_2:
+define void @s_shuffle_v3p0_v4p0__7_2_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_2_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s16
+; GFX900-NEXT:    s_mov_b32 s11, s17
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__5_2_2:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_2_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__5_2_2:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_2_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -10967,55 +10692,58 @@ define void @s_shuffle_v3p0_v4p0__5_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 5, i32 2, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 2, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__6_2_2() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__6_2_2:
+define void @s_shuffle_v3p0_v4p0__7_3_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_3_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__6_2_2:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_3_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__6_2_2:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_3_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -11024,238 +10752,1418 @@ define void @s_shuffle_v3p0_v4p0__6_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 6, i32 2, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 3, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_2_2() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_2_2:
+define void @s_shuffle_v3p0_v4p0__7_4_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_4_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_2_2:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_4_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_4_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 4, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__7_5_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_5_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 5, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__7_6_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_6_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 6, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__u_2_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__u_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__u_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__u_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 poison, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__0_2_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__0_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__0_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__0_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 0, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__1_2_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__1_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__1_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__1_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 1, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__2_2_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__2_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__2_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__2_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 2, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__3_2_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__3_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__3_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__3_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 3, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__4_2_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__4_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__4_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__4_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 4, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__5_2_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__5_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__5_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__5_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 5, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__6_2_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__6_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__6_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__6_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 6, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__7_2_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__7_u_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_u_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 poison, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__7_0_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_0_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_0_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_0_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 0, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__7_1_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_1_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_1_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_1_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 1, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__7_3_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_3_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_3_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_3_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 3, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__7_4_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_4_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_4_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_4_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 4, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__7_5_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_5_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 5, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__7_6_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s10, s20
+; GFX900-NEXT:    s_mov_b32 s11, s21
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s10, s20
+; GFX90A-NEXT:    s_mov_b32 s11, s21
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_6_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 6, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__u_3_3() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__u_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__u_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__u_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 poison, i32 3, i32 3>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__0_3_3() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__0_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__0_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__0_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 0, i32 3, i32 3>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__1_3_3() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__1_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__1_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__1_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 1, i32 3, i32 3>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__2_3_3() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__2_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__2_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_2_2:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__2_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 2, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 2, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_u_2() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_2:
+define void @s_shuffle_v3p0_v4p0__3_3_3() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_2:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_u_2:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 poison, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_0_2() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_0_2:
+define void @s_shuffle_v3p0_v4p0__4_3_3() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__4_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_0_2:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__4_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_0_2:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__4_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 0, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 4, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_1_2() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_1_2:
+define void @s_shuffle_v3p0_v4p0__5_3_3() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__5_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_1_2:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__5_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_1_2:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__5_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -11264,21 +12172,22 @@ define void @s_shuffle_v3p0_v4p0__7_1_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 1, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 5, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_3_2() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_3_2:
+define void @s_shuffle_v3p0_v4p0__6_3_3() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__6_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -11287,16 +12196,16 @@ define void @s_shuffle_v3p0_v4p0__7_3_2() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    s_mov_b32 s10, s14
 ; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_3_2:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__6_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -11305,16 +12214,16 @@ define void @s_shuffle_v3p0_v4p0__7_3_2() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    s_mov_b32 s10, s14
 ; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_3_2:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__6_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -11323,23 +12232,22 @@ define void @s_shuffle_v3p0_v4p0__7_3_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 3, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 6, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_4_2() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_4_2:
+define void @s_shuffle_v3p0_v4p0__7_3_3() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -11350,14 +12258,16 @@ define void @s_shuffle_v3p0_v4p0__7_4_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_4_2:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -11368,14 +12278,16 @@ define void @s_shuffle_v3p0_v4p0__7_4_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_4_2:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -11384,59 +12296,58 @@ define void @s_shuffle_v3p0_v4p0__7_4_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 4, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_5_2() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_2:
+define void @s_shuffle_v3p0_v4p0__7_u_3() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_2:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_5_2:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_u_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -11445,59 +12356,61 @@ define void @s_shuffle_v3p0_v4p0__7_5_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 5, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 poison, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_6_2() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_2:
+define void @s_shuffle_v3p0_v4p0__7_0_3() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_0_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s10, s20
-; GFX900-NEXT:    s_mov_b32 s11, s21
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s18
+; GFX900-NEXT:    s_mov_b32 s13, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_2:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_0_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s10, s20
-; GFX90A-NEXT:    s_mov_b32 s11, s21
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s18
+; GFX90A-NEXT:    s_mov_b32 s13, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_6_2:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_0_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -11506,30 +12419,32 @@ define void @s_shuffle_v3p0_v4p0__7_6_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 6, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 0, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__u_3_3() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__u_3_3:
+define void @s_shuffle_v3p0_v4p0__7_1_3() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_1_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -11537,14 +12452,17 @@ define void @s_shuffle_v3p0_v4p0__u_3_3() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__u_3_3:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_1_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -11552,58 +12470,42 @@ define void @s_shuffle_v3p0_v4p0__u_3_3() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__u_3_3:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_1_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 poison, i32 3, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__0_3_3() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__0_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 0, i32 3, i32 3>
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 1, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__1_3_3() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__1_3_3:
+define void @s_shuffle_v3p0_v4p0__7_2_3() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_2_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -11611,16 +12513,19 @@ define void @s_shuffle_v3p0_v4p0__1_3_3() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__1_3_3:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_2_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -11628,39 +12533,43 @@ define void @s_shuffle_v3p0_v4p0__1_3_3() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__1_3_3:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_2_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 1, i32 3, i32 3>
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 2, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__2_3_3() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__2_3_3:
+define void @s_shuffle_v3p0_v4p0__7_4_3() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_4_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s12
-; GFX900-NEXT:    s_mov_b32 s9, s13
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -11668,16 +12577,19 @@ define void @s_shuffle_v3p0_v4p0__2_3_3() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__2_3_3:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_4_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s12
-; GFX90A-NEXT:    s_mov_b32 s9, s13
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -11685,94 +12597,102 @@ define void @s_shuffle_v3p0_v4p0__2_3_3() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__2_3_3:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_4_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 2, i32 3, i32 3>
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 4, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__3_3_3() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__3_3_3:
+define void @s_shuffle_v3p0_v4p0__7_5_3() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s14
 ; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s12, s18
+; GFX900-NEXT:    s_mov_b32 s13, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__3_3_3:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s14
 ; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s18
+; GFX90A-NEXT:    s_mov_b32 s13, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__3_3_3:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_5_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 3, i32 3, i32 3>
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 5, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__4_3_3() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__4_3_3:
+define void @s_shuffle_v3p0_v4p0__7_6_3() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s10, s20
+; GFX900-NEXT:    s_mov_b32 s11, s21
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -11780,14 +12700,19 @@ define void @s_shuffle_v3p0_v4p0__4_3_3() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__4_3_3:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s10, s20
+; GFX90A-NEXT:    s_mov_b32 s11, s21
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -11795,649 +12720,531 @@ define void @s_shuffle_v3p0_v4p0__4_3_3() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__4_3_3:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_6_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 4, i32 3, i32 3>
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 6, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__5_3_3() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__5_3_3:
+define void @s_shuffle_v3p0_v4p0__u_4_4() {
+; GFX9-LABEL: s_shuffle_v3p0_v4p0__u_4_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use s[8:13]
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 poison, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__0_4_4() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__0_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__5_3_3:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__0_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__5_3_3:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__0_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 5, i32 3, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 0, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__6_3_3() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__6_3_3:
+define void @s_shuffle_v3p0_v4p0__1_4_4() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__1_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__6_3_3:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__1_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__6_3_3:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__1_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 6, i32 3, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 1, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_3_3() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_3_3:
+define void @s_shuffle_v3p0_v4p0__2_4_4() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__2_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_3_3:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__2_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_3_3:
-; GFX942:       ; %bb.0:
-; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__2_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 3, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 2, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_u_3() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_3:
+define void @s_shuffle_v3p0_v4p0__3_4_4() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__3_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s12, s10
-; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_3:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__3_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s10
-; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_u_3:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__3_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 poison, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 3, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_0_3() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_0_3:
+define void @s_shuffle_v3p0_v4p0__4_4_4() {
+; GFX9-LABEL: s_shuffle_v3p0_v4p0__4_4_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use s[8:13]
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 4, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__5_4_4() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__5_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s18
-; GFX900-NEXT:    s_mov_b32 s13, s19
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_0_3:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__5_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s18
-; GFX90A-NEXT:    s_mov_b32 s13, s19
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_0_3:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__5_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 0, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 5, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_1_3() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_1_3:
+define void @s_shuffle_v3p0_v4p0__6_4_4() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__6_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_1_3:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__6_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_1_3:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__6_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 1, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 6, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_2_3() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_2_3:
+define void @s_shuffle_v3p0_v4p0__7_4_4() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_2_3:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_2_3:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 2, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_4_3() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_4_3:
+define void @s_shuffle_v3p0_v4p0__7_u_4() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_4_3:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_4_3:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_u_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 4, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 poison, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_5_3() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_3:
+define void @s_shuffle_v3p0_v4p0__7_0_4() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_0_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s12, s18
-; GFX900-NEXT:    s_mov_b32 s13, s19
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_3:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_0_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s18
-; GFX90A-NEXT:    s_mov_b32 s13, s19
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_5_3:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_0_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 5, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 0, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_6_3() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_3:
+define void @s_shuffle_v3p0_v4p0__7_1_4() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_1_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s10, s20
-; GFX900-NEXT:    s_mov_b32 s11, s21
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_3:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_1_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s10, s20
-; GFX90A-NEXT:    s_mov_b32 s11, s21
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_6_3:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_1_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -12446,831 +13253,783 @@ define void @s_shuffle_v3p0_v4p0__7_6_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 6, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__u_4_4() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__u_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 poison, i32 4, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 1, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__0_4_4() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__0_4_4:
+define void @s_shuffle_v3p0_v4p0__7_2_4() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_2_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__0_4_4:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_2_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__0_4_4:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_2_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 0, i32 4, i32 4>
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 2, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__1_4_4() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__1_4_4:
+define void @s_shuffle_v3p0_v4p0__7_3_4() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_3_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__1_4_4:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_3_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__1_4_4:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_3_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 1, i32 4, i32 4>
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 3, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__2_4_4() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__2_4_4:
+define void @s_shuffle_v3p0_v4p0__7_5_4() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s6
+; GFX900-NEXT:    s_mov_b32 s11, s7
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__2_4_4:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s6
+; GFX90A-NEXT:    s_mov_b32 s11, s7
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__2_4_4:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_5_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 2, i32 4, i32 4>
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 5, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__3_4_4() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__3_4_4:
+define void @s_shuffle_v3p0_v4p0__7_6_4() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s10, s16
+; GFX900-NEXT:    s_mov_b32 s11, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__3_4_4:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s11, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__3_4_4:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_6_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 3, i32 4, i32 4>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__4_4_4() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__4_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 4, i32 4, i32 4>
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 6, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__5_4_4() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__5_4_4:
+define void @s_shuffle_v3p0_v4p0__u_5_5() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__u_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__5_4_4:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__u_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__5_4_4:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__u_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 5, i32 4, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 poison, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__6_4_4() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__6_4_4:
+define void @s_shuffle_v3p0_v4p0__0_5_5() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__0_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__6_4_4:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__0_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__6_4_4:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__0_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 6, i32 4, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 0, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_4_4() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_4_4:
+define void @s_shuffle_v3p0_v4p0__1_5_5() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__1_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_4_4:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__1_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_4_4:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__1_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 4, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 1, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_u_4() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_4:
+define void @s_shuffle_v3p0_v4p0__2_5_5() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__2_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s16
+; GFX900-NEXT:    s_mov_b32 s9, s17
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_4:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__2_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s17
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_u_4:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__2_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 poison, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 2, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_0_4() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_0_4:
+define void @s_shuffle_v3p0_v4p0__3_5_5() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__3_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_0_4:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__3_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_0_4:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__3_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 0, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 3, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_1_4() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_1_4:
+define void @s_shuffle_v3p0_v4p0__4_5_5() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__4_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_1_4:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__4_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_1_4:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__4_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 1, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 4, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_2_4() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_2_4:
+define void @s_shuffle_v3p0_v4p0__5_5_5() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__5_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_2_4:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__5_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_2_4:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__5_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 2, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 5, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_3_4() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_3_4:
+define void @s_shuffle_v3p0_v4p0__6_5_5() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__6_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_3_4:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__6_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_3_4:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__6_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 3, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 6, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_5_4() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_4:
+define void @s_shuffle_v3p0_v4p0__7_5_5() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s6
-; GFX900-NEXT:    s_mov_b32 s11, s7
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_4:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s6
-; GFX90A-NEXT:    s_mov_b32 s11, s7
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_5_4:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 5, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_6_4() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_4:
+define void @s_shuffle_v3p0_v4p0__7_u_5() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s16
-; GFX900-NEXT:    s_mov_b32 s11, s17
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_4:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s16
-; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_6_4:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_u_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 6, i32 4>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__u_5_5() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__u_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 poison, i32 5, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 poison, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__0_5_5() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__0_5_5:
+define void @s_shuffle_v3p0_v4p0__7_0_5() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_0_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -13278,17 +14037,19 @@ define void @s_shuffle_v3p0_v4p0__0_5_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__0_5_5:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_0_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -13296,68 +14057,68 @@ define void @s_shuffle_v3p0_v4p0__0_5_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__0_5_5:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_0_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 0, i32 5, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 0, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__1_5_5() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__1_5_5:
+define void @s_shuffle_v3p0_v4p0__7_1_5() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_1_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s12, s10
-; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__1_5_5:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_1_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s12, s10
-; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__1_5_5:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_1_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -13366,231 +14127,145 @@ define void @s_shuffle_v3p0_v4p0__1_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 1, i32 5, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 1, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__2_5_5() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__2_5_5:
+define void @s_shuffle_v3p0_v4p0__7_2_5() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_2_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s16
-; GFX900-NEXT:    s_mov_b32 s9, s17
-; GFX900-NEXT:    s_mov_b32 s12, s10
-; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__2_5_5:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_2_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s16
-; GFX90A-NEXT:    s_mov_b32 s9, s17
-; GFX90A-NEXT:    s_mov_b32 s12, s10
-; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__2_5_5:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_2_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[12:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 2, i32 5, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 2, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__3_5_5() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__3_5_5:
+define void @s_shuffle_v3p0_v4p0__7_3_5() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_3_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s12, s10
-; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__3_5_5:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_3_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s10
-; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__3_5_5:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_3_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[12:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 3, i32 5, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__4_5_5() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__4_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 4, i32 5, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__5_5_5() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 5, i32 5, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__6_5_5() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__6_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 6, i32 5, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__7_5_5() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__7_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 5, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 3, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_u_5() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_5:
+define void @s_shuffle_v3p0_v4p0__7_4_5() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_4_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -13598,6 +14273,8 @@ define void @s_shuffle_v3p0_v4p0__7_u_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
 ; GFX900-NEXT:    s_mov_b32 s12, s6
 ; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -13605,7 +14282,7 @@ define void @s_shuffle_v3p0_v4p0__7_u_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_5:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_4_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -13613,6 +14290,8 @@ define void @s_shuffle_v3p0_v4p0__7_u_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
 ; GFX90A-NEXT:    s_mov_b32 s12, s6
 ; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -13620,41 +14299,37 @@ define void @s_shuffle_v3p0_v4p0__7_u_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_u_5:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_4_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 poison, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 4, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_0_5() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_0_5:
+define void @s_shuffle_v3p0_v4p0__7_6_5() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s10, s16
+; GFX900-NEXT:    s_mov_b32 s11, s17
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -13662,19 +14337,16 @@ define void @s_shuffle_v3p0_v4p0__7_0_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_0_5:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s11, s17
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -13682,35 +14354,73 @@ define void @s_shuffle_v3p0_v4p0__7_0_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_0_5:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_6_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 6, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__u_6_6() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__u_6_6:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__u_6_6:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__u_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 0, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 poison, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_1_5() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_1_5:
+define void @s_shuffle_v3p0_v4p0__0_6_6() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__0_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -13719,16 +14429,16 @@ define void @s_shuffle_v3p0_v4p0__7_1_5() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s10, s16
+; GFX900-NEXT:    s_mov_b32 s11, s17
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_1_5:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__0_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -13737,16 +14447,16 @@ define void @s_shuffle_v3p0_v4p0__7_1_5() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_1_5:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__0_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -13755,360 +14465,300 @@ define void @s_shuffle_v3p0_v4p0__7_1_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 1, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 0, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_2_5() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_2_5:
+define void @s_shuffle_v3p0_v4p0__1_6_6() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__1_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_2_5:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__1_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_2_5:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__1_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 2, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 1, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_3_5() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_3_5:
+define void @s_shuffle_v3p0_v4p0__2_6_6() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__2_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s8, s16
+; GFX900-NEXT:    s_mov_b32 s9, s17
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_3_5:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__2_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_3_5:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__2_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 3, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 2, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_4_5() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_4_5:
+define void @s_shuffle_v3p0_v4p0__3_6_6() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__3_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_4_5:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__3_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_4_5:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__3_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 4, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 3, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_6_5() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_5:
+define void @s_shuffle_v3p0_v4p0__4_6_6() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__4_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s16
-; GFX900-NEXT:    s_mov_b32 s11, s17
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_5:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__4_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s16
-; GFX90A-NEXT:    s_mov_b32 s11, s17
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_6_5:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__4_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 6, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__u_6_6() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__u_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 poison, i32 6, i32 6>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 4, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__0_6_6() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__0_6_6:
+define void @s_shuffle_v3p0_v4p0__5_6_6() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__5_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s16
-; GFX900-NEXT:    s_mov_b32 s11, s17
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__0_6_6:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__5_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s16
-; GFX90A-NEXT:    s_mov_b32 s11, s17
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__0_6_6:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__5_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 0, i32 6, i32 6>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 5, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__1_6_6() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__1_6_6:
+define void @s_shuffle_v3p0_v4p0__6_6_6() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -14116,17 +14766,14 @@ define void @s_shuffle_v3p0_v4p0__1_6_6() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__1_6_6:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -14134,42 +14781,34 @@ define void @s_shuffle_v3p0_v4p0__1_6_6() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__1_6_6:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 1, i32 6, i32 6>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 6, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__2_6_6() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__2_6_6:
+define void @s_shuffle_v3p0_v4p0__7_6_6() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_6:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s16
-; GFX900-NEXT:    s_mov_b32 s9, s17
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -14177,17 +14816,14 @@ define void @s_shuffle_v3p0_v4p0__2_6_6() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__2_6_6:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s16
-; GFX90A-NEXT:    s_mov_b32 s9, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -14195,190 +14831,63 @@ define void @s_shuffle_v3p0_v4p0__2_6_6() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__2_6_6:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 2, i32 6, i32 6>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__3_6_6() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__3_6_6:
+define void @s_shuffle_v3p0_v4p0__7_u_6() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__3_6_6:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__3_6_6:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_u_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 3, i32 6, i32 6>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__4_6_6() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__4_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 4, i32 6, i32 6>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__5_6_6() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__5_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 5, i32 6, i32 6>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__6_6_6() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__6_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 6, i32 6, i32 6>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__7_6_6() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__7_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 6, i32 6>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__7_u_6() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__7_u_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 poison, i32 6>
@@ -14432,10 +14941,8 @@ define void @s_shuffle_v3p0_v4p0__7_0_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14493,10 +15000,8 @@ define void @s_shuffle_v3p0_v4p0__7_1_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14554,10 +15059,8 @@ define void @s_shuffle_v3p0_v4p0__7_2_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14615,10 +15118,8 @@ define void @s_shuffle_v3p0_v4p0__7_3_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14671,12 +15172,9 @@ define void @s_shuffle_v3p0_v4p0__7_4_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14689,18 +15187,43 @@ define void @s_shuffle_v3p0_v4p0__7_4_6() {
 }
 
 define void @s_shuffle_v3p0_v4p0__7_5_6() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__7_5_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_6:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_6:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_5_6:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 5, i32 6>
@@ -14745,10 +15268,8 @@ define void @s_shuffle_v3p0_v4p0__u_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14806,10 +15327,8 @@ define void @s_shuffle_v3p0_v4p0__0_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14871,12 +15390,9 @@ define void @s_shuffle_v3p0_v4p0__1_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14934,12 +15450,9 @@ define void @s_shuffle_v3p0_v4p0__2_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15001,12 +15514,9 @@ define void @s_shuffle_v3p0_v4p0__3_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15019,20 +15529,48 @@ define void @s_shuffle_v3p0_v4p0__3_7_7() {
 }
 
 define void @s_shuffle_v3p0_v4p0__4_7_7() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__4_7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__4_7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__4_7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__4_7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 4, i32 7, i32 7>
@@ -15081,12 +15619,9 @@ define void @s_shuffle_v3p0_v4p0__5_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15139,12 +15674,9 @@ define void @s_shuffle_v3p0_v4p0__6_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15193,10 +15725,8 @@ define void @s_shuffle_v3p0_v4p0__7_u_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15258,12 +15788,9 @@ define void @s_shuffle_v3p0_v4p0__7_0_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15321,10 +15848,8 @@ define void @s_shuffle_v3p0_v4p0__7_1_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15386,12 +15911,9 @@ define void @s_shuffle_v3p0_v4p0__7_2_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15449,12 +15971,9 @@ define void @s_shuffle_v3p0_v4p0__7_3_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15507,12 +16026,9 @@ define void @s_shuffle_v3p0_v4p0__7_4_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15525,20 +16041,48 @@ define void @s_shuffle_v3p0_v4p0__7_4_7() {
 }
 
 define void @s_shuffle_v3p0_v4p0__7_5_7() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__7_5_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_5_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 5, i32 7>
@@ -15587,12 +16131,9 @@ define void @s_shuffle_v3p0_v4p0__7_6_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll
index ac7d9557ce765..f2538521727d1 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll
@@ -45,6 +45,7 @@ define void @v_shuffle_v4i64_v2i64__0_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -84,10 +85,12 @@ define void @v_shuffle_v4i64_v2i64__1_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v2i64__1_u_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
@@ -140,10 +143,12 @@ define void @v_shuffle_v4i64_v2i64__3_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_u_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
@@ -203,7 +208,8 @@ define void @v_shuffle_v4i64_v2i64__3_0_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[2:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
@@ -254,14 +260,15 @@ define void @v_shuffle_v4i64_v2i64__3_1_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_1_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -308,7 +315,7 @@ define void @v_shuffle_v4i64_v2i64__3_2_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
@@ -351,10 +358,12 @@ define void @v_shuffle_v4i64_v2i64__3_3_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_3_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
@@ -2362,6 +2371,7 @@ define void @v_shuffle_v4i64_v2i64__0_2_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2401,10 +2411,12 @@ define void @v_shuffle_v4i64_v2i64__1_2_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v2i64__1_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
@@ -3734,8 +3746,7 @@ define void @s_shuffle_v4i64_v2i64__1_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3793,8 +3804,7 @@ define void @s_shuffle_v4i64_v2i64__3_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3852,10 +3862,8 @@ define void @s_shuffle_v4i64_v2i64__3_0_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3909,8 +3917,7 @@ define void @s_shuffle_v4i64_v2i64__3_1_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3959,10 +3966,8 @@ define void @s_shuffle_v4i64_v2i64__3_2_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3975,18 +3980,43 @@ define void @s_shuffle_v4i64_v2i64__3_2_u_u() {
 }
 
 define void @s_shuffle_v4i64_v2i64__3_3_u_u() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_3_u_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_u_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_u_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_3_u_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 poison, i32 poison>
@@ -3995,21 +4025,52 @@ define void @s_shuffle_v4i64_v2i64__3_3_u_u() {
 }
 
 define void @s_shuffle_v4i64_v2i64__3_3_0_u() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_3_0_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_0_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_0_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_3_0_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 0, i32 poison>
@@ -4063,10 +4124,8 @@ define void @s_shuffle_v4i64_v2i64__3_3_1_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4079,20 +4138,48 @@ define void @s_shuffle_v4i64_v2i64__3_3_1_u() {
 }
 
 define void @s_shuffle_v4i64_v2i64__3_3_2_u() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_3_2_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_2_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_2_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_3_2_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 2, i32 poison>
@@ -4101,20 +4188,48 @@ define void @s_shuffle_v4i64_v2i64__3_3_2_u() {
 }
 
 define void @s_shuffle_v4i64_v2i64__3_3_3_u() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_3_3_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_3_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_3_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_3_3_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 3, i32 poison>
@@ -4172,12 +4287,9 @@ define void @s_shuffle_v4i64_v2i64__3_3_3_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4190,62 +4302,96 @@ define void @s_shuffle_v4i64_v2i64__3_3_3_0() {
 }
 
 define void @s_shuffle_v4i64_v2i64__3_3_3_1() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_3_3_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <2 x i64> asm "; def $0", "=s"()
-  %vec1 = call <2 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 3, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v2i64__3_3_3_2() {
-; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_3_2:
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_3_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:7]
+; GFX900-NEXT:    ; def s[12:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s10, s6
-; GFX900-NEXT:    s_mov_b32 s11, s7
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_3_2:
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_3_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:7]
+; GFX90A-NEXT:    ; def s[12:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s10, s6
-; GFX90A-NEXT:    s_mov_b32 s11, s7
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_3_3_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <2 x i64> asm "; def $0", "=s"()
+  %vec1 = call <2 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 3, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v2i64__3_3_3_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_3_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:7]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s6
+; GFX900-NEXT:    s_mov_b32 s11, s7
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_3_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:7]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s6
+; GFX90A-NEXT:    s_mov_b32 s11, s7
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    s_mov_b32 s14, s4
 ; GFX90A-NEXT:    s_mov_b32 s15, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -4259,14 +4405,10 @@ define void @s_shuffle_v4i64_v2i64__3_3_3_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4279,22 +4421,53 @@ define void @s_shuffle_v4i64_v2i64__3_3_3_2() {
 }
 
 define void @s_shuffle_v4i64_v2i64__3_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_3_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -4303,20 +4476,48 @@ define void @s_shuffle_v4i64_v2i64__3_3_3_3() {
 }
 
 define void @s_shuffle_v4i64_v2i64__u_0_0_0() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__u_0_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__u_0_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__u_0_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__u_0_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> <i32 poison, i32 0, i32 0, i32 0>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
@@ -4324,22 +4525,53 @@ define void @s_shuffle_v4i64_v2i64__u_0_0_0() {
 }
 
 define void @s_shuffle_v4i64_v2i64__0_0_0_0() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__0_0_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_mov_b32 s13, s9
-; GFX9-NEXT:    s_mov_b32 s14, s8
-; GFX9-NEXT:    s_mov_b32 s15, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__0_0_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    s_mov_b32 s12, s8
+; GFX900-NEXT:    s_mov_b32 s13, s9
+; GFX900-NEXT:    s_mov_b32 s14, s8
+; GFX900-NEXT:    s_mov_b32 s15, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__0_0_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    s_mov_b32 s12, s8
+; GFX90A-NEXT:    s_mov_b32 s13, s9
+; GFX90A-NEXT:    s_mov_b32 s14, s8
+; GFX90A-NEXT:    s_mov_b32 s15, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__0_0_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[8:9]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> zeroinitializer
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
@@ -4347,61 +4579,14 @@ define void @s_shuffle_v4i64_v2i64__0_0_0_0() {
 }
 
 define void @s_shuffle_v4i64_v2i64__1_0_0_0() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__1_0_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <2 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v2i64__2_0_0_0() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__2_0_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <2 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> <i32 2, i32 0, i32 0, i32 0>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v2i64__3_0_0_0() {
-; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_0_0_0:
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__1_0_0_0:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:7]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s14, s12
@@ -4411,54 +4596,526 @@ define void @s_shuffle_v4i64_v2i64__3_0_0_0() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_0_0_0:
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__1_0_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__1_0_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <2 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v2i64__2_0_0_0() {
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__2_0_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__2_0_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__2_0_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <2 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> <i32 2, i32 0, i32 0, i32 0>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v2i64__3_0_0_0() {
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_0_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:7]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_0_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:7]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_0_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:3]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <2 x i64> asm "; def $0", "=s"()
+  %vec1 = call <2 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v2i64__3_u_0_0() {
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_u_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:7]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_u_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:7]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_u_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:3]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <2 x i64> asm "; def $0", "=s"()
+  %vec1 = call <2 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 poison, i32 0, i32 0>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v2i64__3_1_0_0() {
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_1_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:7]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_1_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:7]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_1_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:3]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <2 x i64> asm "; def $0", "=s"()
+  %vec1 = call <2 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 1, i32 0, i32 0>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v2i64__3_2_0_0() {
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_2_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:7]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_2_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:7]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_2_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:3]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <2 x i64> asm "; def $0", "=s"()
+  %vec1 = call <2 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 2, i32 0, i32 0>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v2i64__3_3_0_0() {
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_3_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <2 x i64> asm "; def $0", "=s"()
+  %vec1 = call <2 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 0, i32 0>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v2i64__3_3_u_0() {
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_u_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:7]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_u_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:7]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_3_u_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:3]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <2 x i64> asm "; def $0", "=s"()
+  %vec1 = call <2 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 poison, i32 0>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v2i64__3_3_1_0() {
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_1_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:7]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_1_0:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ; def s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:7]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s14, s12
-; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_0_0_0:
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_3_1_0:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ; def s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
+  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v2i64__3_u_0_0() {
-; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_u_0_0:
+define void @s_shuffle_v4i64_v2i64__3_3_2_0() {
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_2_0:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -4467,16 +5124,18 @@ define void @s_shuffle_v4i64_v2i64__3_u_0_0() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:7]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s14, s12
-; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_u_0_0:
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_2_0:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -4485,16 +5144,18 @@ define void @s_shuffle_v4i64_v2i64__3_u_0_0() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:7]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s14, s12
-; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_u_0_0:
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_3_2_0:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -4503,182 +5164,223 @@ define void @s_shuffle_v4i64_v2i64__3_u_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 poison, i32 0, i32 0>
+  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 2, i32 0>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v2i64__3_1_0_0() {
-; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_1_0_0:
+define void @s_shuffle_v4i64_v2i64__u_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__u_1_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:7]
+; GFX900-NEXT:    ; def s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s14, s12
-; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_1_0_0:
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__u_1_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:15]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:7]
+; GFX90A-NEXT:    ; def s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s14, s12
-; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_1_0_0:
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__u_1_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:3]
+; GFX942-NEXT:    ; def s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
-  %vec1 = call <2 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 1, i32 0, i32 0>
+  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> <i32 poison, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v2i64__3_2_0_0() {
-; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_2_0_0:
+define void @s_shuffle_v4i64_v2i64__0_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__0_1_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:7]
+; GFX900-NEXT:    ; def s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s14, s12
-; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_2_0_0:
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__0_1_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:15]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:7]
+; GFX90A-NEXT:    ; def s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s12
-; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_2_0_0:
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__0_1_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ; def s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:3]
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <2 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v2i64__1_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__1_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__1_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__1_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
-  %vec1 = call <2 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 2, i32 0, i32 0>
+  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v2i64__3_3_0_0() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_3_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+define void @s_shuffle_v4i64_v2i64__2_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__2_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__2_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__2_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
-  %vec1 = call <2 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 0, i32 0>
+  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v2i64__3_3_u_0() {
-; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_u_0:
+define void @s_shuffle_v4i64_v2i64__3_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_1_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -4687,16 +5389,18 @@ define void @s_shuffle_v4i64_v2i64__3_3_u_0() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:7]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_u_0:
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_1_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -4705,16 +5409,18 @@ define void @s_shuffle_v4i64_v2i64__3_3_u_0() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:7]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_3_u_0:
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_1_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -4723,90 +5429,81 @@ define void @s_shuffle_v4i64_v2i64__3_3_u_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 poison, i32 0>
+  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v2i64__3_3_1_0() {
-; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_1_0:
+define void @s_shuffle_v4i64_v2i64__3_u_1_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_u_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ; def s[12:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:7]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_1_0:
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_u_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ; def s[12:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:7]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_3_1_0:
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_u_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ; def s[12:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
+  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 poison, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v2i64__3_3_2_0() {
-; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_2_0:
+define void @s_shuffle_v4i64_v2i64__3_0_1_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_0_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -4815,18 +5512,18 @@ define void @s_shuffle_v4i64_v2i64__3_3_2_0() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:7]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_2_0:
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_0_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -4835,18 +5532,18 @@ define void @s_shuffle_v4i64_v2i64__3_3_2_0() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:7]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_3_2_0:
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_0_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -4855,188 +5552,96 @@ define void @s_shuffle_v4i64_v2i64__3_3_2_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 2, i32 0>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v2i64__u_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__u_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <2 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> <i32 poison, i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v2i64__0_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__0_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <2 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v2i64__1_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__1_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <2 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v2i64__2_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__2_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <2 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
+  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 0, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v2i64__3_1_1_1() {
-; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_1_1_1:
+define void @s_shuffle_v4i64_v2i64__3_2_1_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_2_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ; def s[12:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:7]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s6
 ; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s12, s10
-; GFX900-NEXT:    s_mov_b32 s13, s11
-; GFX900-NEXT:    s_mov_b32 s14, s10
-; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_1_1_1:
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_2_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ; def s[12:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:7]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s6
 ; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s12, s10
-; GFX90A-NEXT:    s_mov_b32 s13, s11
-; GFX90A-NEXT:    s_mov_b32 s14, s10
-; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_1_1_1:
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_2_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ; def s[12:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
+  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 2, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v2i64__3_u_1_1() {
-; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_u_1_1:
+define void @s_shuffle_v4i64_v2i64__3_3_1_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:7]
+; GFX900-NEXT:    ; def s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -5044,17 +5649,17 @@ define void @s_shuffle_v4i64_v2i64__3_u_1_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_u_1_1:
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:7]
+; GFX90A-NEXT:    ; def s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -5062,228 +5667,129 @@ define void @s_shuffle_v4i64_v2i64__3_u_1_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_u_1_1:
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_3_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:3]
+; GFX942-NEXT:    ; def s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 poison, i32 1, i32 1>
+  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v2i64__3_0_1_1() {
-; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_0_1_1:
+define void @s_shuffle_v4i64_v2i64__3_3_u_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_u_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ; def s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:7]
+; GFX900-NEXT:    ; def s[12:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_0_1_1:
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_u_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ; def s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:7]
+; GFX90A-NEXT:    ; def s[12:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_0_1_1:
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_3_u_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ; def s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:3]
+; GFX942-NEXT:    ; def s[12:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 0, i32 1, i32 1>
+  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 poison, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v2i64__3_2_1_1() {
-; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_2_1_1:
+define void @s_shuffle_v4i64_v2i64__3_3_0_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_0_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ; def s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:7]
+; GFX900-NEXT:    ; def s[12:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_2_1_1:
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_0_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ; def s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:7]
+; GFX90A-NEXT:    ; def s[12:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_2_1_1:
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_3_0_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ; def s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:3]
+; GFX942-NEXT:    ; def s[12:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <2 x i64> asm "; def $0", "=s"()
-  %vec1 = call <2 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 2, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v2i64__3_3_1_1() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_3_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <2 x i64> asm "; def $0", "=s"()
-  %vec1 = call <2 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v2i64__3_3_u_1() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_3_u_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <2 x i64> asm "; def $0", "=s"()
-  %vec1 = call <2 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 poison, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v2i64__3_3_0_1() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_3_0_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 0, i32 1>
@@ -5341,12 +5847,9 @@ define void @s_shuffle_v4i64_v2i64__3_3_2_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5445,8 +5948,7 @@ define void @s_shuffle_v4i64_v2i64__1_2_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5472,22 +5974,53 @@ define void @s_shuffle_v4i64_v2i64__2_2_2_2() {
 }
 
 define void @s_shuffle_v4i64_v2i64__3_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_2_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_2_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_2_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 2, i32 2, i32 2>
@@ -5496,41 +6029,189 @@ define void @s_shuffle_v4i64_v2i64__3_2_2_2() {
 }
 
 define void @s_shuffle_v4i64_v2i64__3_u_2_2() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_u_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_u_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_u_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_u_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <2 x i64> asm "; def $0", "=s"()
+  %vec1 = call <2 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 poison, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v2i64__3_0_2_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_0_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:7]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_0_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:7]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_0_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:3]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <2 x i64> asm "; def $0", "=s"()
+  %vec1 = call <2 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 0, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v2i64__3_1_2_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_1_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_1_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_1_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 poison, i32 2, i32 2>
+  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 1, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v2i64__3_0_2_2() {
-; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_0_2_2:
+define void @s_shuffle_v4i64_v2i64__3_3_2_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:7]
-; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s14
 ; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
 ; GFX900-NEXT:    s_mov_b32 s14, s12
 ; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -5538,19 +6219,16 @@ define void @s_shuffle_v4i64_v2i64__3_0_2_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_0_2_2:
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:7]
-; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s14
 ; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
 ; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -5558,74 +6236,19 @@ define void @s_shuffle_v4i64_v2i64__3_0_2_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_0_2_2:
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_3_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:3]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <2 x i64> asm "; def $0", "=s"()
-  %vec1 = call <2 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 0, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v2i64__3_1_2_2() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_1_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <2 x i64> asm "; def $0", "=s"()
-  %vec1 = call <2 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 1, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v2i64__3_3_2_2() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_3_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 2, i32 2>
@@ -5674,12 +6297,9 @@ define void @s_shuffle_v4i64_v2i64__3_3_u_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5741,12 +6361,9 @@ define void @s_shuffle_v4i64_v2i64__3_3_0_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5812,14 +6429,10 @@ define void @s_shuffle_v4i64_v2i64__3_3_1_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5832,20 +6445,48 @@ define void @s_shuffle_v4i64_v2i64__3_3_1_2() {
 }
 
 define void @s_shuffle_v4i64_v2i64__u_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__u_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__u_3_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__u_3_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__u_3_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 poison, i32 3, i32 3, i32 3>
@@ -5854,23 +6495,57 @@ define void @s_shuffle_v4i64_v2i64__u_3_3_3() {
 }
 
 define void @s_shuffle_v4i64_v2i64__0_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__0_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__0_3_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__0_3_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__0_3_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 0, i32 3, i32 3, i32 3>
@@ -5928,12 +6603,9 @@ define void @s_shuffle_v4i64_v2i64__1_3_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5946,20 +6618,48 @@ define void @s_shuffle_v4i64_v2i64__1_3_3_3() {
 }
 
 define void @s_shuffle_v4i64_v2i64__2_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__2_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__2_3_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__2_3_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__2_3_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
@@ -5968,20 +6668,48 @@ define void @s_shuffle_v4i64_v2i64__2_3_3_3() {
 }
 
 define void @s_shuffle_v4i64_v2i64__3_u_3_3() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_u_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_u_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_u_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_u_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 poison, i32 3, i32 3>
@@ -6039,12 +6767,9 @@ define void @s_shuffle_v4i64_v2i64__3_0_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6057,23 +6782,57 @@ define void @s_shuffle_v4i64_v2i64__3_0_3_3() {
 }
 
 define void @s_shuffle_v4i64_v2i64__3_1_3_3() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_1_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_1_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_1_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_1_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 1, i32 3, i32 3>
@@ -6082,22 +6841,53 @@ define void @s_shuffle_v4i64_v2i64__3_1_3_3() {
 }
 
 define void @s_shuffle_v4i64_v2i64__3_2_3_3() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_2_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_2_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_2_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_2_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 2, i32 3, i32 3>
@@ -6106,20 +6896,48 @@ define void @s_shuffle_v4i64_v2i64__3_2_3_3() {
 }
 
 define void @s_shuffle_v4i64_v2i64__3_3_u_3() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_3_u_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_u_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_u_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_3_u_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 poison, i32 3>
@@ -6128,23 +6946,57 @@ define void @s_shuffle_v4i64_v2i64__3_3_u_3() {
 }
 
 define void @s_shuffle_v4i64_v2i64__3_3_0_3() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_3_0_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_0_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_0_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_3_0_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 0, i32 3>
@@ -6202,12 +7054,9 @@ define void @s_shuffle_v4i64_v2i64__3_3_1_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6220,20 +7069,48 @@ define void @s_shuffle_v4i64_v2i64__3_3_1_3() {
 }
 
 define void @s_shuffle_v4i64_v2i64__3_3_2_3() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_3_2_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_2_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_2_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_3_2_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll
index 8dd4a40d00680..d4f80eab1e28e 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll
@@ -87,6 +87,7 @@ define void @v_shuffle_v4i64_v3i64__1_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -126,10 +127,12 @@ define void @v_shuffle_v4i64_v3i64__2_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v3i64__2_u_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
@@ -182,6 +185,7 @@ define void @v_shuffle_v4i64_v3i64__4_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -222,10 +226,12 @@ define void @v_shuffle_v4i64_v3i64__5_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_u_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
@@ -285,11 +291,11 @@ define void @v_shuffle_v4i64_v3i64__5_0_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[2:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b32_e32 v2, v6
+; GFX942-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -336,14 +342,15 @@ define void @v_shuffle_v4i64_v3i64__5_1_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_1_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v10, 0
+; GFX942-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v8
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v9
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1]
@@ -392,14 +399,15 @@ define void @v_shuffle_v4i64_v3i64__5_2_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_2_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[6:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v10
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v11
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[2:5], s[0:1]
@@ -446,10 +454,12 @@ define void @v_shuffle_v4i64_v3i64__5_3_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_3_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
@@ -494,10 +504,12 @@ define void @v_shuffle_v4i64_v3i64__5_4_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_4_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
@@ -540,10 +552,12 @@ define void @v_shuffle_v4i64_v3i64__5_5_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_5_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
@@ -4699,6 +4713,7 @@ define void @v_shuffle_v4i64_v3i64__1_3_3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4738,10 +4753,12 @@ define void @v_shuffle_v4i64_v3i64__2_3_3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v3i64__2_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
@@ -7818,8 +7835,7 @@ define void @s_shuffle_v4i64_v3i64__1_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7859,8 +7875,7 @@ define void @s_shuffle_v4i64_v3i64__2_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7918,8 +7933,7 @@ define void @s_shuffle_v4i64_v3i64__4_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7960,8 +7974,7 @@ define void @s_shuffle_v4i64_v3i64__5_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8016,11 +8029,11 @@ define void @s_shuffle_v4i64_v3i64__5_0_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8070,8 +8083,7 @@ define void @s_shuffle_v4i64_v3i64__5_1_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8125,10 +8137,8 @@ define void @s_shuffle_v4i64_v3i64__5_2_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8173,10 +8183,8 @@ define void @s_shuffle_v4i64_v3i64__5_3_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8189,18 +8197,43 @@ define void @s_shuffle_v4i64_v3i64__5_3_u_u() {
 }
 
 define void @s_shuffle_v4i64_v3i64__5_4_u_u() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_4_u_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_4_u_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_4_u_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_4_u_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 5, i32 4, i32 poison, i32 poison>
@@ -8245,10 +8278,8 @@ define void @s_shuffle_v4i64_v3i64__5_5_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8310,12 +8341,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_0_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8377,12 +8405,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_1_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8440,10 +8465,8 @@ define void @s_shuffle_v4i64_v3i64__5_5_2_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8492,12 +8515,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_3_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8550,12 +8570,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_4_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8568,20 +8585,48 @@ define void @s_shuffle_v4i64_v3i64__5_5_4_u() {
 }
 
 define void @s_shuffle_v4i64_v3i64__5_5_5_u() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_5_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_5_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_5_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_5_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 5, i32 5, i32 5, i32 poison>
@@ -8639,12 +8684,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_5_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8706,12 +8748,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_5_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8773,12 +8812,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_5_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8835,14 +8871,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_5_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8895,14 +8927,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_5_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8915,22 +8943,53 @@ define void @s_shuffle_v4i64_v3i64__5_5_5_4() {
 }
 
 define void @s_shuffle_v4i64_v3i64__5_5_5_5() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 5, i32 5, i32 5, i32 5>
@@ -8979,12 +9038,9 @@ define void @s_shuffle_v4i64_v3i64__u_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8996,22 +9052,53 @@ define void @s_shuffle_v4i64_v3i64__u_0_0_0() {
 }
 
 define void @s_shuffle_v4i64_v3i64__0_0_0_0() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__0_0_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_mov_b32 s13, s9
-; GFX9-NEXT:    s_mov_b32 s14, s8
-; GFX9-NEXT:    s_mov_b32 s15, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__0_0_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    s_mov_b32 s12, s8
+; GFX900-NEXT:    s_mov_b32 s13, s9
+; GFX900-NEXT:    s_mov_b32 s14, s8
+; GFX900-NEXT:    s_mov_b32 s15, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__0_0_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    s_mov_b32 s12, s8
+; GFX90A-NEXT:    s_mov_b32 s13, s9
+; GFX90A-NEXT:    s_mov_b32 s14, s8
+; GFX90A-NEXT:    s_mov_b32 s15, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__0_0_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[8:9]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> zeroinitializer
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
@@ -9063,14 +9150,10 @@ define void @s_shuffle_v4i64_v3i64__1_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9122,14 +9205,10 @@ define void @s_shuffle_v4i64_v3i64__2_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9181,12 +9260,9 @@ define void @s_shuffle_v4i64_v3i64__3_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9248,17 +9324,13 @@ define void @s_shuffle_v4i64_v3i64__4_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9321,15 +9393,12 @@ define void @s_shuffle_v4i64_v3i64__5_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9388,13 +9457,11 @@ define void @s_shuffle_v4i64_v3i64__5_u_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9457,15 +9524,12 @@ define void @s_shuffle_v4i64_v3i64__5_1_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9527,14 +9591,10 @@ define void @s_shuffle_v4i64_v3i64__5_2_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9597,15 +9657,12 @@ define void @s_shuffle_v4i64_v3i64__5_3_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9667,12 +9724,9 @@ define void @s_shuffle_v4i64_v3i64__5_4_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9738,14 +9792,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9807,12 +9857,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_u_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9878,14 +9925,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_1_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9951,14 +9994,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_2_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10020,12 +10059,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_3_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10091,14 +10127,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_4_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10111,87 +10143,205 @@ define void @s_shuffle_v4i64_v3i64__5_5_4_0() {
 }
 
 define void @s_shuffle_v4i64_v3i64__u_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__u_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 poison, i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v3i64__0_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__0_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v3i64__1_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__1_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__u_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__u_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__u_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 poison, i32 1, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v3i64__0_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__0_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__0_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__0_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v3i64__1_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__1_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__1_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__1_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
 define void @s_shuffle_v4i64_v3i64__2_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__2_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__2_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__2_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__2_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
@@ -10199,20 +10349,48 @@ define void @s_shuffle_v4i64_v3i64__2_1_1_1() {
 }
 
 define void @s_shuffle_v4i64_v3i64__3_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__3_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__3_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__3_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__3_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
@@ -10269,12 +10447,9 @@ define void @s_shuffle_v4i64_v3i64__4_1_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10332,12 +10507,9 @@ define void @s_shuffle_v4i64_v3i64__5_1_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10396,13 +10568,11 @@ define void @s_shuffle_v4i64_v3i64__5_u_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10465,15 +10635,12 @@ define void @s_shuffle_v4i64_v3i64__5_0_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10531,14 +10698,10 @@ define void @s_shuffle_v4i64_v3i64__5_2_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10601,15 +10764,12 @@ define void @s_shuffle_v4i64_v3i64__5_3_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10671,12 +10831,9 @@ define void @s_shuffle_v4i64_v3i64__5_4_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10742,14 +10899,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10811,12 +10964,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_u_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10882,14 +11032,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_0_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10951,14 +11097,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_2_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11020,12 +11162,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_3_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11091,14 +11230,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_4_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11111,20 +11246,48 @@ define void @s_shuffle_v4i64_v3i64__5_5_4_1() {
 }
 
 define void @s_shuffle_v4i64_v3i64__u_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__u_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__u_2_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__u_2_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__u_2_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 poison, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
@@ -11132,20 +11295,48 @@ define void @s_shuffle_v4i64_v3i64__u_2_2_2() {
 }
 
 define void @s_shuffle_v4i64_v3i64__0_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__0_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__0_2_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__0_2_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__0_2_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 0, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
@@ -11153,22 +11344,53 @@ define void @s_shuffle_v4i64_v3i64__0_2_2_2() {
 }
 
 define void @s_shuffle_v4i64_v3i64__1_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__1_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__1_2_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__1_2_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__1_2_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 1, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
@@ -11176,22 +11398,53 @@ define void @s_shuffle_v4i64_v3i64__1_2_2_2() {
 }
 
 define void @s_shuffle_v4i64_v3i64__2_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__2_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__2_2_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__2_2_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__2_2_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
@@ -11199,20 +11452,48 @@ define void @s_shuffle_v4i64_v3i64__2_2_2_2() {
 }
 
 define void @s_shuffle_v4i64_v3i64__3_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__3_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__3_2_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__3_2_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__3_2_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 3, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
@@ -11269,12 +11550,9 @@ define void @s_shuffle_v4i64_v3i64__4_2_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11332,12 +11610,9 @@ define void @s_shuffle_v4i64_v3i64__5_2_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11391,10 +11666,8 @@ define void @s_shuffle_v4i64_v3i64__5_u_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11456,14 +11729,10 @@ define void @s_shuffle_v4i64_v3i64__5_0_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11517,10 +11786,8 @@ define void @s_shuffle_v4i64_v3i64__5_1_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11578,12 +11845,9 @@ define void @s_shuffle_v4i64_v3i64__5_3_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11645,12 +11909,9 @@ define void @s_shuffle_v4i64_v3i64__5_4_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11712,12 +11973,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11779,12 +12037,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_u_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11846,14 +12101,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_0_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11919,14 +12170,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_1_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11992,12 +12239,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_3_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12063,14 +12307,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_4_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12169,8 +12409,7 @@ define void @s_shuffle_v4i64_v3i64__1_3_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12210,8 +12449,7 @@ define void @s_shuffle_v4i64_v3i64__2_3_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12281,14 +12519,10 @@ define void @s_shuffle_v4i64_v3i64__4_3_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12341,14 +12575,10 @@ define void @s_shuffle_v4i64_v3i64__5_3_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12397,12 +12627,9 @@ define void @s_shuffle_v4i64_v3i64__5_u_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12465,15 +12692,12 @@ define void @s_shuffle_v4i64_v3i64__5_0_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12531,12 +12755,9 @@ define void @s_shuffle_v4i64_v3i64__5_1_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12598,14 +12819,10 @@ define void @s_shuffle_v4i64_v3i64__5_2_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:21]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s20
-; GFX942-NEXT:    s_mov_b32 s9, s21
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s16
-; GFX942-NEXT:    s_mov_b32 s13, s17
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[20:21]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12658,14 +12875,10 @@ define void @s_shuffle_v4i64_v3i64__5_4_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12722,14 +12935,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12782,12 +12991,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_u_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12853,14 +13059,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_0_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:21]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s20
-; GFX942-NEXT:    s_mov_b32 s9, s21
-; GFX942-NEXT:    s_mov_b32 s10, s20
-; GFX942-NEXT:    s_mov_b32 s11, s21
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[20:21]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[20:21]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12926,14 +13128,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_1_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:21]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s20
-; GFX942-NEXT:    s_mov_b32 s9, s21
-; GFX942-NEXT:    s_mov_b32 s10, s20
-; GFX942-NEXT:    s_mov_b32 s11, s21
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[20:21]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[20:21]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12995,12 +13193,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_2_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13057,14 +13252,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_4_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13077,20 +13268,48 @@ define void @s_shuffle_v4i64_v3i64__5_5_4_3() {
 }
 
 define void @s_shuffle_v4i64_v3i64__u_4_4_4() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__u_4_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__u_4_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__u_4_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__u_4_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 poison, i32 4, i32 4, i32 4>
@@ -13144,12 +13363,9 @@ define void @s_shuffle_v4i64_v3i64__0_4_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13211,12 +13427,9 @@ define void @s_shuffle_v4i64_v3i64__1_4_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13278,12 +13491,9 @@ define void @s_shuffle_v4i64_v3i64__2_4_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13296,20 +13506,48 @@ define void @s_shuffle_v4i64_v3i64__2_4_4_4() {
 }
 
 define void @s_shuffle_v4i64_v3i64__3_4_4_4() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__3_4_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__3_4_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__3_4_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__3_4_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 3, i32 4, i32 4, i32 4>
@@ -13318,22 +13556,53 @@ define void @s_shuffle_v4i64_v3i64__3_4_4_4() {
 }
 
 define void @s_shuffle_v4i64_v3i64__4_4_4_4() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__4_4_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__4_4_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__4_4_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__4_4_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
@@ -13342,22 +13611,53 @@ define void @s_shuffle_v4i64_v3i64__4_4_4_4() {
 }
 
 define void @s_shuffle_v4i64_v3i64__5_4_4_4() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_4_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_4_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_4_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_4_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 5, i32 4, i32 4, i32 4>
@@ -13402,12 +13702,9 @@ define void @s_shuffle_v4i64_v3i64__5_u_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13466,15 +13763,12 @@ define void @s_shuffle_v4i64_v3i64__5_0_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13532,12 +13826,9 @@ define void @s_shuffle_v4i64_v3i64__5_1_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13599,12 +13890,9 @@ define void @s_shuffle_v4i64_v3i64__5_2_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13657,14 +13945,10 @@ define void @s_shuffle_v4i64_v3i64__5_3_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13717,14 +14001,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13773,12 +14053,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_u_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13840,12 +14117,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_0_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13907,12 +14181,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_1_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13974,12 +14245,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_2_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14028,14 +14296,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_3_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14048,20 +14312,48 @@ define void @s_shuffle_v4i64_v3i64__5_5_3_4() {
 }
 
 define void @s_shuffle_v4i64_v3i64__u_5_5_5() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__u_5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__u_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__u_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__u_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 poison, i32 5, i32 5, i32 5>
@@ -14119,12 +14411,9 @@ define void @s_shuffle_v4i64_v3i64__0_5_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14186,12 +14475,9 @@ define void @s_shuffle_v4i64_v3i64__1_5_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14253,12 +14539,9 @@ define void @s_shuffle_v4i64_v3i64__2_5_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14271,20 +14554,48 @@ define void @s_shuffle_v4i64_v3i64__2_5_5_5() {
 }
 
 define void @s_shuffle_v4i64_v3i64__3_5_5_5() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__3_5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__3_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__3_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__3_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 3, i32 5, i32 5, i32 5>
@@ -14293,22 +14604,53 @@ define void @s_shuffle_v4i64_v3i64__3_5_5_5() {
 }
 
 define void @s_shuffle_v4i64_v3i64__4_5_5_5() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__4_5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__4_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__4_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__4_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 4, i32 5, i32 5, i32 5>
@@ -14317,20 +14659,48 @@ define void @s_shuffle_v4i64_v3i64__4_5_5_5() {
 }
 
 define void @s_shuffle_v4i64_v3i64__5_u_5_5() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_u_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_u_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_u_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_u_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 5, i32 poison, i32 5, i32 5>
@@ -14388,12 +14758,9 @@ define void @s_shuffle_v4i64_v3i64__5_0_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14455,12 +14822,9 @@ define void @s_shuffle_v4i64_v3i64__5_1_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14522,12 +14886,9 @@ define void @s_shuffle_v4i64_v3i64__5_2_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14584,14 +14945,10 @@ define void @s_shuffle_v4i64_v3i64__5_3_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14604,20 +14961,48 @@ define void @s_shuffle_v4i64_v3i64__5_3_5_5() {
 }
 
 define void @s_shuffle_v4i64_v3i64__5_4_5_5() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_4_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_4_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_4_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_4_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 5, i32 4, i32 5, i32 5>
@@ -14666,12 +15051,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_u_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14737,14 +15119,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_0_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14810,14 +15188,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_1_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14879,12 +15253,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_2_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14937,14 +15308,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_3_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15001,14 +15368,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_4_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll
index ea9ef2f1ac94a..aaaadb95351c0 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll
@@ -87,6 +87,7 @@ define void @v_shuffle_v4i64_v4i64__1_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -126,6 +127,7 @@ define void @v_shuffle_v4i64_v4i64__2_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -165,10 +167,12 @@ define void @v_shuffle_v4i64_v4i64__3_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v4i64__3_u_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -221,6 +225,7 @@ define void @v_shuffle_v4i64_v4i64__5_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -261,6 +266,7 @@ define void @v_shuffle_v4i64_v4i64__6_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -301,10 +307,12 @@ define void @v_shuffle_v4i64_v4i64__7_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_u_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -364,11 +372,11 @@ define void @v_shuffle_v4i64_v4i64__7_0_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[2:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v8
-; GFX942-NEXT:    v_mov_b32_e32 v3, v9
+; GFX942-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b32_e32 v2, v8
+; GFX942-NEXT:    v_mov_b32_e32 v3, v9
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -415,14 +423,15 @@ define void @v_shuffle_v4i64_v4i64__7_1_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_1_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v12, 0
+; GFX942-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v10
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v11
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1]
@@ -471,14 +480,15 @@ define void @v_shuffle_v4i64_v4i64__7_2_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_2_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v14, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v14, 0
+; GFX942-NEXT:    global_store_dwordx4 v14, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[6:13]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v12
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v13
 ; GFX942-NEXT:    global_store_dwordx4 v14, v[2:5], s[0:1]
@@ -534,7 +544,7 @@ define void @v_shuffle_v4i64_v4i64__7_3_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v14
 ; GFX942-NEXT:    v_mov_b32_e32 v5, v15
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
@@ -581,14 +591,15 @@ define void @v_shuffle_v4i64_v4i64__7_4_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_4_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b32_e32 v2, v6
+; GFX942-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -629,10 +640,12 @@ define void @v_shuffle_v4i64_v4i64__7_5_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_5_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -675,10 +688,12 @@ define void @v_shuffle_v4i64_v4i64__7_6_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_6_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
@@ -727,6 +742,7 @@ define void @v_shuffle_v4i64_v4i64__7_7_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -7703,6 +7719,7 @@ define void @v_shuffle_v4i64_v4i64__1_4_4_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -7742,6 +7759,7 @@ define void @v_shuffle_v4i64_v4i64__2_4_4_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -7781,10 +7799,12 @@ define void @v_shuffle_v4i64_v4i64__3_4_4_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v4i64__3_4_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -13289,8 +13309,7 @@ define void @s_shuffle_v4i64_v4i64__1_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13330,8 +13349,7 @@ define void @s_shuffle_v4i64_v4i64__2_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13375,8 +13393,7 @@ define void @s_shuffle_v4i64_v4i64__3_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13434,8 +13451,7 @@ define void @s_shuffle_v4i64_v4i64__5_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13476,8 +13492,7 @@ define void @s_shuffle_v4i64_v4i64__6_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13522,8 +13537,7 @@ define void @s_shuffle_v4i64_v4i64__7_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13582,10 +13596,8 @@ define void @s_shuffle_v4i64_v4i64__7_0_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13639,8 +13651,7 @@ define void @s_shuffle_v4i64_v4i64__7_1_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13698,10 +13709,8 @@ define void @s_shuffle_v4i64_v4i64__7_2_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13755,10 +13764,8 @@ define void @s_shuffle_v4i64_v4i64__7_3_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13807,10 +13814,8 @@ define void @s_shuffle_v4i64_v4i64__7_4_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13823,18 +13828,43 @@ define void @s_shuffle_v4i64_v4i64__7_4_u_u() {
 }
 
 define void @s_shuffle_v4i64_v4i64__7_5_u_u() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_5_u_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_u_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_5_u_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_5_u_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 5, i32 poison, i32 poison>
@@ -13879,10 +13909,8 @@ define void @s_shuffle_v4i64_v4i64__7_6_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13927,10 +13955,8 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13989,13 +14015,11 @@ define void @s_shuffle_v4i64_v4i64__7_7_0_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14054,13 +14078,11 @@ define void @s_shuffle_v4i64_v4i64__7_7_1_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14114,10 +14136,8 @@ define void @s_shuffle_v4i64_v4i64__7_7_2_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14175,12 +14195,9 @@ define void @s_shuffle_v4i64_v4i64__7_7_3_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14229,12 +14246,9 @@ define void @s_shuffle_v4i64_v4i64__7_7_4_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14283,12 +14297,9 @@ define void @s_shuffle_v4i64_v4i64__7_7_5_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14301,20 +14312,48 @@ define void @s_shuffle_v4i64_v4i64__7_7_5_u() {
 }
 
 define void @s_shuffle_v4i64_v4i64__7_7_6_u() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_6_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_6_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 poison>
@@ -14363,12 +14402,9 @@ define void @s_shuffle_v4i64_v4i64__7_7_7_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14434,14 +14470,10 @@ define void @s_shuffle_v4i64_v4i64__7_7_7_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14507,14 +14539,10 @@ define void @s_shuffle_v4i64_v4i64__7_7_7_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14580,14 +14608,10 @@ define void @s_shuffle_v4i64_v4i64__7_7_7_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14649,12 +14673,9 @@ define void @s_shuffle_v4i64_v4i64__7_7_7_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14711,14 +14732,10 @@ define void @s_shuffle_v4i64_v4i64__7_7_7_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14771,14 +14788,10 @@ define void @s_shuffle_v4i64_v4i64__7_7_7_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14835,14 +14848,10 @@ define void @s_shuffle_v4i64_v4i64__7_7_7_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14855,22 +14864,53 @@ define void @s_shuffle_v4i64_v4i64__7_7_7_6() {
 }
 
 define void @s_shuffle_v4i64_v4i64__7_7_7_7() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
@@ -14919,12 +14959,9 @@ define void @s_shuffle_v4i64_v4i64__u_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14936,22 +14973,53 @@ define void @s_shuffle_v4i64_v4i64__u_0_0_0() {
 }
 
 define void @s_shuffle_v4i64_v4i64__0_0_0_0() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__0_0_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_mov_b32 s13, s9
-; GFX9-NEXT:    s_mov_b32 s14, s8
-; GFX9-NEXT:    s_mov_b32 s15, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__0_0_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    s_mov_b32 s12, s8
+; GFX900-NEXT:    s_mov_b32 s13, s9
+; GFX900-NEXT:    s_mov_b32 s14, s8
+; GFX900-NEXT:    s_mov_b32 s15, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__0_0_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    s_mov_b32 s12, s8
+; GFX90A-NEXT:    s_mov_b32 s13, s9
+; GFX90A-NEXT:    s_mov_b32 s14, s8
+; GFX90A-NEXT:    s_mov_b32 s15, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__0_0_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[8:9]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> zeroinitializer
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
@@ -15003,14 +15071,10 @@ define void @s_shuffle_v4i64_v4i64__1_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15062,14 +15126,10 @@ define void @s_shuffle_v4i64_v4i64__2_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15125,14 +15185,10 @@ define void @s_shuffle_v4i64_v4i64__3_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15184,12 +15240,9 @@ define void @s_shuffle_v4i64_v4i64__4_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15251,17 +15304,13 @@ define void @s_shuffle_v4i64_v4i64__5_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15324,15 +15373,12 @@ define void @s_shuffle_v4i64_v4i64__6_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15395,17 +15441,13 @@ define void @s_shuffle_v4i64_v4i64__7_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15464,15 +15506,12 @@ define void @s_shuffle_v4i64_v4i64__7_u_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15535,17 +15574,13 @@ define void @s_shuffle_v4i64_v4i64__7_1_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15611,14 +15646,10 @@ define void @s_shuffle_v4i64_v4i64__7_2_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15680,14 +15711,10 @@ define void @s_shuffle_v4i64_v4i64__7_3_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15750,17 +15777,13 @@ define void @s_shuffle_v4i64_v4i64__7_4_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15822,12 +15845,9 @@ define void @s_shuffle_v4i64_v4i64__7_5_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15893,14 +15913,10 @@ define void @s_shuffle_v4i64_v4i64__7_6_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15963,15 +15979,12 @@ define void @s_shuffle_v4i64_v4i64__7_7_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16030,13 +16043,11 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s0
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16099,15 +16110,12 @@ define void @s_shuffle_v4i64_v4i64__7_7_1_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16169,14 +16177,10 @@ define void @s_shuffle_v4i64_v4i64__7_7_2_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16238,14 +16242,10 @@ define void @s_shuffle_v4i64_v4i64__7_7_3_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16304,15 +16304,12 @@ define void @s_shuffle_v4i64_v4i64__7_7_4_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s0
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16375,15 +16372,12 @@ define void @s_shuffle_v4i64_v4i64__7_7_5_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s0
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16445,12 +16439,9 @@ define void @s_shuffle_v4i64_v4i64__7_7_6_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16463,149 +16454,12 @@ define void @s_shuffle_v4i64_v4i64__7_7_6_0() {
 }
 
 define void @s_shuffle_v4i64_v4i64__u_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__u_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 poison, i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__0_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__0_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__1_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__1_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__2_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__2_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__3_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__3_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__4_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__4_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 4, i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__5_1_1_1() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__5_1_1_1:
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__u_1_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
 ; GFX900-NEXT:    s_mov_b32 s12, s10
 ; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    s_mov_b32 s14, s10
@@ -16615,17 +16469,12 @@ define void @s_shuffle_v4i64_v4i64__5_1_1_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__5_1_1_1:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__u_1_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
 ; GFX90A-NEXT:    s_mov_b32 s12, s10
 ; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    s_mov_b32 s14, s10
@@ -16635,44 +16484,31 @@ define void @s_shuffle_v4i64_v4i64__5_1_1_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__5_1_1_1:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__u_1_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 5, i32 1, i32 1, i32 1>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 poison, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__6_1_1_1() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__6_1_1_1:
+define void @s_shuffle_v4i64_v4i64__0_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__0_1_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s16
-; GFX900-NEXT:    s_mov_b32 s9, s17
 ; GFX900-NEXT:    s_mov_b32 s12, s10
 ; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    s_mov_b32 s14, s10
@@ -16682,17 +16518,12 @@ define void @s_shuffle_v4i64_v4i64__6_1_1_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__6_1_1_1:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__0_1_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s16
-; GFX90A-NEXT:    s_mov_b32 s9, s17
 ; GFX90A-NEXT:    s_mov_b32 s12, s10
 ; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    s_mov_b32 s14, s10
@@ -16702,44 +16533,33 @@ define void @s_shuffle_v4i64_v4i64__6_1_1_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__6_1_1_1:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__0_1_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 6, i32 1, i32 1, i32 1>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_1_1_1() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_1_1_1:
+define void @s_shuffle_v4i64_v4i64__1_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__1_1_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    s_mov_b32 s12, s10
 ; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    s_mov_b32 s14, s10
@@ -16749,17 +16569,14 @@ define void @s_shuffle_v4i64_v4i64__7_1_1_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_1_1_1:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__1_1_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    s_mov_b32 s12, s10
 ; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    s_mov_b32 s14, s10
@@ -16769,283 +16586,224 @@ define void @s_shuffle_v4i64_v4i64__7_1_1_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_1_1_1:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__1_1_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 1, i32 1, i32 1>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_u_1_1() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_1_1:
+define void @s_shuffle_v4i64_v4i64__2_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__2_1_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
-; GFX900-NEXT:    s_mov_b32 s14, s6
-; GFX900-NEXT:    s_mov_b32 s15, s7
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_1_1:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__2_1_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
-; GFX90A-NEXT:    s_mov_b32 s14, s6
-; GFX90A-NEXT:    s_mov_b32 s15, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_u_1_1:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__2_1_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 poison, i32 1, i32 1>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_0_1_1() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_0_1_1:
+define void @s_shuffle_v4i64_v4i64__3_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__3_1_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s14
 ; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
-; GFX900-NEXT:    s_mov_b32 s14, s6
-; GFX900-NEXT:    s_mov_b32 s15, s7
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_0_1_1:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__3_1_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s14
 ; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
-; GFX90A-NEXT:    s_mov_b32 s14, s6
-; GFX90A-NEXT:    s_mov_b32 s15, s7
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_0_1_1:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__3_1_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 0, i32 1, i32 1>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_2_1_1() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_2_1_1:
+define void @s_shuffle_v4i64_v4i64__4_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__4_1_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s16
-; GFX900-NEXT:    s_mov_b32 s11, s17
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_2_1_1:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__4_1_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s16
-; GFX90A-NEXT:    s_mov_b32 s11, s17
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_2_1_1:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__4_1_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 2, i32 1, i32 1>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 4, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_3_1_1() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_3_1_1:
+define void @s_shuffle_v4i64_v4i64__5_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__5_1_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
-; GFX900-NEXT:    s_mov_b32 s14, s6
-; GFX900-NEXT:    s_mov_b32 s15, s7
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_3_1_1:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__5_1_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
-; GFX90A-NEXT:    s_mov_b32 s14, s6
-; GFX90A-NEXT:    s_mov_b32 s15, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_3_1_1:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__5_1_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -17054,140 +16812,126 @@ define void @s_shuffle_v4i64_v4i64__7_3_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 3, i32 1, i32 1>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 5, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_4_1_1() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_4_1_1:
+define void @s_shuffle_v4i64_v4i64__6_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__6_1_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
-; GFX900-NEXT:    s_mov_b32 s14, s6
-; GFX900-NEXT:    s_mov_b32 s15, s7
+; GFX900-NEXT:    s_mov_b32 s8, s16
+; GFX900-NEXT:    s_mov_b32 s9, s17
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_4_1_1:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__6_1_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
-; GFX90A-NEXT:    s_mov_b32 s14, s6
-; GFX90A-NEXT:    s_mov_b32 s15, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s17
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_4_1_1:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__6_1_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 4, i32 1, i32 1>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 6, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_5_1_1() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_1_1:
+define void @s_shuffle_v4i64_v4i64__7_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_1_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
-; GFX900-NEXT:    s_mov_b32 s14, s6
-; GFX900-NEXT:    s_mov_b32 s15, s7
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_5_1_1:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_1_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
-; GFX90A-NEXT:    s_mov_b32 s14, s6
-; GFX90A-NEXT:    s_mov_b32 s15, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_5_1_1:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_1_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -17196,25 +16940,22 @@ define void @s_shuffle_v4i64_v4i64__7_5_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 5, i32 1, i32 1>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_6_1_1() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_1_1:
+define void @s_shuffle_v4i64_v4i64__7_u_1_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -17225,8 +16966,6 @@ define void @s_shuffle_v4i64_v4i64__7_6_1_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s14
 ; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s12, s6
 ; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    s_mov_b32 s14, s6
@@ -17236,7 +16975,7 @@ define void @s_shuffle_v4i64_v4i64__7_6_1_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_1_1:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -17247,8 +16986,6 @@ define void @s_shuffle_v4i64_v4i64__7_6_1_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s14
 ; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s12, s6
 ; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    s_mov_b32 s14, s6
@@ -17258,36 +16995,31 @@ define void @s_shuffle_v4i64_v4i64__7_6_1_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_6_1_1:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_u_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 6, i32 1, i32 1>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 poison, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_1_1() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_1_1:
+define void @s_shuffle_v4i64_v4i64__7_0_1_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_0_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -17298,8 +17030,8 @@ define void @s_shuffle_v4i64_v4i64__7_7_1_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s14
 ; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
 ; GFX900-NEXT:    s_mov_b32 s12, s6
 ; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    s_mov_b32 s14, s6
@@ -17309,7 +17041,7 @@ define void @s_shuffle_v4i64_v4i64__7_7_1_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_1_1:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_0_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -17320,8 +17052,8 @@ define void @s_shuffle_v4i64_v4i64__7_7_1_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s14
 ; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
 ; GFX90A-NEXT:    s_mov_b32 s12, s6
 ; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    s_mov_b32 s14, s6
@@ -17331,113 +17063,109 @@ define void @s_shuffle_v4i64_v4i64__7_7_1_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_1_1:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_0_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 1>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 0, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_u_1() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_1:
+define void @s_shuffle_v4i64_v4i64__7_2_1_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_2_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s14, s6
-; GFX900-NEXT:    s_mov_b32 s15, s7
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s16
+; GFX900-NEXT:    s_mov_b32 s11, s17
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_1:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_2_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s14, s6
-; GFX90A-NEXT:    s_mov_b32 s15, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_u_1:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_2_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s2
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 1>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 2, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_0_1() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_0_1:
+define void @s_shuffle_v4i64_v4i64__7_3_1_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_3_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    s_mov_b32 s14, s6
 ; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -17445,21 +17173,19 @@ define void @s_shuffle_v4i64_v4i64__7_7_0_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_0_1:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_3_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    s_mov_b32 s14, s6
 ; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -17467,135 +17193,141 @@ define void @s_shuffle_v4i64_v4i64__7_7_0_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_0_1:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_3_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 1>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 3, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_2_1() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_2_1:
+define void @s_shuffle_v4i64_v4i64__7_4_1_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_4_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s14, s6
+; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_2_1:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_4_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s14, s6
+; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_2_1:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_4_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 1>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 4, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_3_1() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_3_1:
+define void @s_shuffle_v4i64_v4i64__7_5_1_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s18
-; GFX900-NEXT:    s_mov_b32 s13, s19
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s14, s6
+; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_3_1:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_5_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s18
-; GFX90A-NEXT:    s_mov_b32 s13, s19
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s14, s6
+; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_3_1:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_5_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -17604,39 +17336,36 @@ define void @s_shuffle_v4i64_v4i64__7_7_3_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 1>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 5, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_4_1() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_4_1:
+define void @s_shuffle_v4i64_v4i64__7_6_1_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s18
-; GFX900-NEXT:    s_mov_b32 s11, s19
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    s_mov_b32 s14, s6
 ; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -17644,19 +17373,21 @@ define void @s_shuffle_v4i64_v4i64__7_7_4_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_4_1:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s18
-; GFX90A-NEXT:    s_mov_b32 s11, s19
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    s_mov_b32 s14, s6
 ; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -17664,48 +17395,46 @@ define void @s_shuffle_v4i64_v4i64__7_7_4_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_4_1:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_6_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s2
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 1>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 6, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_5_1() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_5_1:
+define void @s_shuffle_v4i64_v4i64__7_7_1_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s18
-; GFX900-NEXT:    s_mov_b32 s11, s19
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    s_mov_b32 s14, s6
 ; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -17713,21 +17442,21 @@ define void @s_shuffle_v4i64_v4i64__7_7_5_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_5_1:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s18
-; GFX90A-NEXT:    s_mov_b32 s11, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    s_mov_b32 s14, s6
 ; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -17735,34 +17464,31 @@ define void @s_shuffle_v4i64_v4i64__7_7_5_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_5_1:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s2
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 1>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_6_1() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_1:
+define void @s_shuffle_v4i64_v4i64__7_7_u_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -17782,7 +17508,7 @@ define void @s_shuffle_v4i64_v4i64__7_7_6_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_1:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -17802,269 +17528,134 @@ define void @s_shuffle_v4i64_v4i64__7_7_6_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_6_1:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_u_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__u_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__u_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 poison, i32 2, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__0_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__0_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__1_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__1_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 1, i32 2, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__2_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__2_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__3_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__3_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__4_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__4_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 4, i32 2, i32 2, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__5_2_2_2() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__5_2_2_2:
+define void @s_shuffle_v4i64_v4i64__7_7_0_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_0_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s14, s12
-; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s14, s6
+; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__5_2_2_2:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_0_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s14, s12
-; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s6
+; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__5_2_2_2:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_0_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 5, i32 2, i32 2, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__6_2_2_2() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__6_2_2_2:
+define void @s_shuffle_v4i64_v4i64__7_7_2_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_2_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s14, s12
-; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__6_2_2_2:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_2_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s14, s12
-; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__6_2_2_2:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_2_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -18073,65 +17664,59 @@ define void @s_shuffle_v4i64_v4i64__6_2_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 6, i32 2, i32 2, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_2_2_2() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_2_2_2:
+define void @s_shuffle_v4i64_v4i64__7_7_3_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_3_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s14, s12
-; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    s_mov_b32 s12, s18
+; GFX900-NEXT:    s_mov_b32 s13, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_2_2_2:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_3_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s14, s12
-; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s18
+; GFX90A-NEXT:    s_mov_b32 s13, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_2_2_2:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_3_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -18140,195 +17725,195 @@ define void @s_shuffle_v4i64_v4i64__7_2_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 2, i32 2, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_u_2_2() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_2_2:
+define void @s_shuffle_v4i64_v4i64__7_7_4_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_4_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s14, s12
-; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s10, s18
+; GFX900-NEXT:    s_mov_b32 s11, s19
+; GFX900-NEXT:    s_mov_b32 s14, s6
+; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_2_2:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_4_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s14, s12
-; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s10, s18
+; GFX90A-NEXT:    s_mov_b32 s11, s19
+; GFX90A-NEXT:    s_mov_b32 s14, s6
+; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_u_2_2:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_4_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 poison, i32 2, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_0_2_2() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_0_2_2:
+define void @s_shuffle_v4i64_v4i64__7_7_5_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_5_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s10, s18
+; GFX900-NEXT:    s_mov_b32 s11, s19
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s14, s6
+; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_0_2_2:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_5_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s10, s18
+; GFX90A-NEXT:    s_mov_b32 s11, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s6
+; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_0_2_2:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_5_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 0, i32 2, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_1_2_2() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_1_2_2:
+define void @s_shuffle_v4i64_v4i64__7_7_6_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s14, s12
-; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s14, s6
+; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_1_2_2:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s14, s12
-; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s6
+; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_1_2_2:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_6_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -18337,35 +17922,29 @@ define void @s_shuffle_v4i64_v4i64__7_1_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 1, i32 2, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_3_2_2() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_3_2_2:
+define void @s_shuffle_v4i64_v4i64__u_2_2_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__u_2_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s14, s12
 ; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -18373,19 +17952,14 @@ define void @s_shuffle_v4i64_v4i64__7_3_2_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_3_2_2:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__u_2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
 ; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -18393,46 +17967,33 @@ define void @s_shuffle_v4i64_v4i64__7_3_2_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_3_2_2:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__u_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 3, i32 2, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 poison, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_4_2_2() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_4_2_2:
+define void @s_shuffle_v4i64_v4i64__0_2_2_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__0_2_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s14, s12
 ; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -18440,19 +18001,14 @@ define void @s_shuffle_v4i64_v4i64__7_4_2_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_4_2_2:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__0_2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
 ; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -18460,113 +18016,89 @@ define void @s_shuffle_v4i64_v4i64__7_4_2_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_4_2_2:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__0_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 4, i32 2, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_5_2_2() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_2_2:
+define void @s_shuffle_v4i64_v4i64__1_2_2_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__1_2_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_5_2_2:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__1_2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_5_2_2:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__1_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 5, i32 2, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 1, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_6_2_2() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_2_2:
+define void @s_shuffle_v4i64_v4i64__2_2_2_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__2_2_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s10, s20
-; GFX900-NEXT:    s_mov_b32 s11, s21
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s14, s12
 ; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -18574,19 +18106,16 @@ define void @s_shuffle_v4i64_v4i64__7_6_2_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_2_2:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__2_2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s10, s20
-; GFX90A-NEXT:    s_mov_b32 s11, s21
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
 ; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -18594,44 +18123,36 @@ define void @s_shuffle_v4i64_v4i64__7_6_2_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_6_2_2:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__2_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 6, i32 2, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_2_2() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_2_2:
+define void @s_shuffle_v4i64_v4i64__3_2_2_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__3_2_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s14, s12
 ; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -18639,17 +18160,16 @@ define void @s_shuffle_v4i64_v4i64__7_7_2_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_2_2:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__3_2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
 ; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -18657,44 +18177,34 @@ define void @s_shuffle_v4i64_v4i64__7_7_2_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_2_2:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__3_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_u_2() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_2:
+define void @s_shuffle_v4i64_v4i64__4_2_2_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__4_2_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s14, s12
 ; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -18702,17 +18212,14 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_2:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__4_2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
 ; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -18720,70 +18227,66 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_u_2:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__4_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 4, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_0_2() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_0_2:
+define void @s_shuffle_v4i64_v4i64__5_2_2_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__5_2_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_0_2:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__5_2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_0_2:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__5_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -18792,67 +18295,58 @@ define void @s_shuffle_v4i64_v4i64__7_7_0_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 5, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_1_2() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_1_2:
+define void @s_shuffle_v4i64_v4i64__6_2_2_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__6_2_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_1_2:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__6_2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_1_2:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__6_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -18861,67 +18355,62 @@ define void @s_shuffle_v4i64_v4i64__7_7_1_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 6, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_3_2() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_3_2:
+define void @s_shuffle_v4i64_v4i64__7_2_2_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_2_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s18
-; GFX900-NEXT:    s_mov_b32 s13, s19
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_3_2:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s18
-; GFX90A-NEXT:    s_mov_b32 s13, s19
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_3_2:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -18930,94 +18419,81 @@ define void @s_shuffle_v4i64_v4i64__7_7_3_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_4_2() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_4_2:
+define void @s_shuffle_v4i64_v4i64__7_u_2_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_4_2:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_4_2:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_u_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s18
-; GFX942-NEXT:    s_mov_b32 s11, s19
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 poison, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_5_2() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_5_2:
+define void @s_shuffle_v4i64_v4i64__7_0_2_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_0_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -19028,8 +18504,10 @@ define void @s_shuffle_v4i64_v4i64__7_7_5_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
 ; GFX900-NEXT:    s_mov_b32 s14, s16
 ; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -19037,7 +18515,7 @@ define void @s_shuffle_v4i64_v4i64__7_7_5_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_5_2:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_0_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -19048,8 +18526,10 @@ define void @s_shuffle_v4i64_v4i64__7_7_5_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
 ; GFX90A-NEXT:    s_mov_b32 s14, s16
 ; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -19057,76 +18537,68 @@ define void @s_shuffle_v4i64_v4i64__7_7_5_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_5_2:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_0_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s18
-; GFX942-NEXT:    s_mov_b32 s11, s19
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 0, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_6_2() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_2:
+define void @s_shuffle_v4i64_v4i64__7_1_2_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_1_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[16:23]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_2:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_1_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[16:23]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_6_2:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_1_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -19135,157 +18607,21 @@ define void @s_shuffle_v4i64_v4i64__7_7_6_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__u_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__u_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 poison, i32 3, i32 3, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__0_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__0_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 0, i32 3, i32 3, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__1_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__1_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 1, i32 3, i32 3, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__2_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__2_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__3_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__3_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__4_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__4_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 4, i32 3, i32 3, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 1, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__5_3_3_3() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__5_3_3_3:
+define void @s_shuffle_v4i64_v4i64__7_3_2_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_3_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -19294,18 +18630,18 @@ define void @s_shuffle_v4i64_v4i64__5_3_3_3() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    s_mov_b32 s10, s14
 ; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__5_3_3_3:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_3_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -19314,18 +18650,18 @@ define void @s_shuffle_v4i64_v4i64__5_3_3_3() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    s_mov_b32 s10, s14
 ; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__5_3_3_3:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_3_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -19334,25 +18670,22 @@ define void @s_shuffle_v4i64_v4i64__5_3_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 5, i32 3, i32 3, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 3, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__6_3_3_3() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__6_3_3_3:
+define void @s_shuffle_v4i64_v4i64__7_4_2_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_4_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -19361,16 +18694,18 @@ define void @s_shuffle_v4i64_v4i64__6_3_3_3() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__6_3_3_3:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_4_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -19379,16 +18714,18 @@ define void @s_shuffle_v4i64_v4i64__6_3_3_3() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__6_3_3_3:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_4_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -19397,65 +18734,62 @@ define void @s_shuffle_v4i64_v4i64__6_3_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 6, i32 3, i32 3, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 4, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_3_3_3() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_3_3_3:
+define void @s_shuffle_v4i64_v4i64__7_5_2_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_3_3_3:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_5_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_3_3_3:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_5_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -19464,61 +18798,62 @@ define void @s_shuffle_v4i64_v4i64__7_3_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 3, i32 3, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 5, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_u_3_3() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_3_3:
+define void @s_shuffle_v4i64_v4i64__7_6_2_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[16:23]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s10, s20
+; GFX900-NEXT:    s_mov_b32 s11, s21
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_3_3:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[16:23]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s10, s20
+; GFX90A-NEXT:    s_mov_b32 s11, s21
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_u_3_3:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_6_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -19527,67 +18862,58 @@ define void @s_shuffle_v4i64_v4i64__7_u_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 poison, i32 3, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 6, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_0_3_3() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_0_3_3:
+define void @s_shuffle_v4i64_v4i64__7_7_2_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s18
-; GFX900-NEXT:    s_mov_b32 s13, s19
-; GFX900-NEXT:    s_mov_b32 s14, s18
-; GFX900-NEXT:    s_mov_b32 s15, s19
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_0_3_3:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s18
-; GFX90A-NEXT:    s_mov_b32 s13, s19
-; GFX90A-NEXT:    s_mov_b32 s14, s18
-; GFX90A-NEXT:    s_mov_b32 s15, s19
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_0_3_3:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -19596,63 +18922,58 @@ define void @s_shuffle_v4i64_v4i64__7_0_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 0, i32 3, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_1_3_3() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_1_3_3:
+define void @s_shuffle_v4i64_v4i64__7_7_u_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_1_3_3:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_1_3_3:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_u_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -19661,63 +18982,58 @@ define void @s_shuffle_v4i64_v4i64__7_1_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 1, i32 3, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_2_3_3() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_2_3_3:
+define void @s_shuffle_v4i64_v4i64__7_7_0_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_0_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_2_3_3:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_0_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_2_3_3:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_0_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -19726,65 +19042,63 @@ define void @s_shuffle_v4i64_v4i64__7_2_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 2, i32 3, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_4_3_3() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_4_3_3:
+define void @s_shuffle_v4i64_v4i64__7_7_1_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_1_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_4_3_3:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_1_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_4_3_3:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_1_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -19793,65 +19107,63 @@ define void @s_shuffle_v4i64_v4i64__7_4_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 4, i32 3, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_5_3_3() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_3_3:
+define void @s_shuffle_v4i64_v4i64__7_7_3_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_3_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    s_mov_b32 s12, s18
 ; GFX900-NEXT:    s_mov_b32 s13, s19
-; GFX900-NEXT:    s_mov_b32 s14, s18
-; GFX900-NEXT:    s_mov_b32 s15, s19
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_5_3_3:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_3_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    s_mov_b32 s12, s18
 ; GFX90A-NEXT:    s_mov_b32 s13, s19
-; GFX90A-NEXT:    s_mov_b32 s14, s18
-; GFX90A-NEXT:    s_mov_b32 s15, s19
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_5_3_3:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_3_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -19860,187 +19172,192 @@ define void @s_shuffle_v4i64_v4i64__7_5_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 5, i32 3, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_6_3_3() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_3_3:
+define void @s_shuffle_v4i64_v4i64__7_7_4_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_4_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s10, s20
-; GFX900-NEXT:    s_mov_b32 s11, s21
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_3_3:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_4_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s10, s20
-; GFX90A-NEXT:    s_mov_b32 s11, s21
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_6_3_3:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_4_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[12:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 6, i32 3, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_3_3() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_3_3:
+define void @s_shuffle_v4i64_v4i64__7_7_5_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_5_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_3_3:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_5_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_3_3:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_5_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[12:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_u_3() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_3:
+define void @s_shuffle_v4i64_v4i64__7_7_6_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_3:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_u_3:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_6_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -20049,926 +19366,898 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_0_3() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_0_3:
+define void @s_shuffle_v4i64_v4i64__u_3_3_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__u_3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s14, s18
-; GFX900-NEXT:    s_mov_b32 s15, s19
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_0_3:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__u_3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s14, s18
-; GFX90A-NEXT:    s_mov_b32 s15, s19
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_0_3:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__u_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 poison, i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_1_3() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_1_3:
+define void @s_shuffle_v4i64_v4i64__0_3_3_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__0_3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
-; GFX900-NEXT:    s_mov_b32 s14, s18
-; GFX900-NEXT:    s_mov_b32 s15, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_1_3:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__0_3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
-; GFX90A-NEXT:    s_mov_b32 s14, s18
-; GFX90A-NEXT:    s_mov_b32 s15, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_1_3:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__0_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 0, i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_2_3() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_2_3:
+define void @s_shuffle_v4i64_v4i64__1_3_3_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__1_3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_2_3:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__1_3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_2_3:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__1_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 1, i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_4_3() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_4_3:
+define void @s_shuffle_v4i64_v4i64__2_3_3_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__2_3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_4_3:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__2_3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_4_3:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__2_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_5_3() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_5_3:
+define void @s_shuffle_v4i64_v4i64__3_3_3_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__3_3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_5_3:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__3_3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_5_3:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__3_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_6_3() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_3:
+define void @s_shuffle_v4i64_v4i64__4_3_3_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__4_3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
 ; GFX900-NEXT:    s_mov_b32 s10, s14
 ; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s14, s18
-; GFX900-NEXT:    s_mov_b32 s15, s19
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_3:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__4_3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
 ; GFX90A-NEXT:    s_mov_b32 s10, s14
 ; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s14, s18
-; GFX90A-NEXT:    s_mov_b32 s15, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_6_3:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__4_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__u_4_4_4() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__u_4_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 poison, i32 4, i32 4, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 4, i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__0_4_4_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__0_4_4_4:
+define void @s_shuffle_v4i64_v4i64__5_3_3_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__5_3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__0_4_4_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__5_3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__0_4_4_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__5_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 5, i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__1_4_4_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__1_4_4_4:
+define void @s_shuffle_v4i64_v4i64__6_3_3_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__6_3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__1_4_4_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__6_3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__1_4_4_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__6_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 1, i32 4, i32 4, i32 4>
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 6, i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__2_4_4_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__2_4_4_4:
+define void @s_shuffle_v4i64_v4i64__7_3_3_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__2_4_4_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__2_4_4_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 2, i32 4, i32 4, i32 4>
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__3_4_4_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__3_4_4_4:
+define void @s_shuffle_v4i64_v4i64__7_u_3_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__3_4_4_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__3_4_4_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_u_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 3, i32 4, i32 4, i32 4>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__4_4_4_4() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__4_4_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 poison, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__5_4_4_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__5_4_4_4:
+define void @s_shuffle_v4i64_v4i64__7_0_3_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_0_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s18
+; GFX900-NEXT:    s_mov_b32 s13, s19
+; GFX900-NEXT:    s_mov_b32 s14, s18
+; GFX900-NEXT:    s_mov_b32 s15, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__5_4_4_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_0_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s18
+; GFX90A-NEXT:    s_mov_b32 s13, s19
+; GFX90A-NEXT:    s_mov_b32 s14, s18
+; GFX90A-NEXT:    s_mov_b32 s15, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__5_4_4_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_0_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 5, i32 4, i32 4, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 0, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__6_4_4_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__6_4_4_4:
+define void @s_shuffle_v4i64_v4i64__7_1_3_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_1_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__6_4_4_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_1_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__6_4_4_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_1_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 6, i32 4, i32 4, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 1, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_4_4_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_4_4_4:
+define void @s_shuffle_v4i64_v4i64__7_2_3_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_2_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_4_4_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_2_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_4_4_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_2_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 4, i32 4, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 2, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_u_4_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_4_4:
+define void @s_shuffle_v4i64_v4i64__7_4_3_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_4_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_4_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_4_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_u_4_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_4_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 poison, i32 4, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 4, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_0_4_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_0_4_4:
+define void @s_shuffle_v4i64_v4i64__7_5_3_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s18
+; GFX900-NEXT:    s_mov_b32 s13, s19
+; GFX900-NEXT:    s_mov_b32 s14, s18
+; GFX900-NEXT:    s_mov_b32 s15, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_0_4_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_5_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s18
+; GFX90A-NEXT:    s_mov_b32 s13, s19
+; GFX90A-NEXT:    s_mov_b32 s14, s18
+; GFX90A-NEXT:    s_mov_b32 s15, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_0_4_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_5_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 0, i32 4, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 5, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_1_4_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_1_4_4:
+define void @s_shuffle_v4i64_v4i64__7_6_3_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -20979,16 +20268,16 @@ define void @s_shuffle_v4i64_v4i64__7_1_4_4() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s22
 ; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s10, s20
+; GFX900-NEXT:    s_mov_b32 s11, s21
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_1_4_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -20999,16 +20288,16 @@ define void @s_shuffle_v4i64_v4i64__7_1_4_4() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s22
 ; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s20
+; GFX90A-NEXT:    s_mov_b32 s11, s21
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_1_4_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_6_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -21017,25 +20306,22 @@ define void @s_shuffle_v4i64_v4i64__7_1_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 1, i32 4, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 6, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_2_4_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_2_4_4:
+define void @s_shuffle_v4i64_v4i64__7_7_3_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -21046,18 +20332,14 @@ define void @s_shuffle_v4i64_v4i64__7_2_4_4() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_2_4_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -21068,649 +20350,609 @@ define void @s_shuffle_v4i64_v4i64__7_2_4_4() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_2_4_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[16:23]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s22
-; GFX942-NEXT:    s_mov_b32 s9, s23
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s16
-; GFX942-NEXT:    s_mov_b32 s13, s17
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 2, i32 4, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_3_4_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_3_4_4:
+define void @s_shuffle_v4i64_v4i64__7_7_u_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_3_4_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_3_4_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_u_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[16:23]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s22
-; GFX942-NEXT:    s_mov_b32 s9, s23
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s16
-; GFX942-NEXT:    s_mov_b32 s13, s17
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 3, i32 4, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_5_4_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_4_4:
+define void @s_shuffle_v4i64_v4i64__7_7_0_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_0_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s6
-; GFX900-NEXT:    s_mov_b32 s11, s7
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    s_mov_b32 s14, s18
+; GFX900-NEXT:    s_mov_b32 s15, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_5_4_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_0_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s6
-; GFX90A-NEXT:    s_mov_b32 s11, s7
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s18
+; GFX90A-NEXT:    s_mov_b32 s15, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_5_4_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_0_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 5, i32 4, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_6_4_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_4_4:
+define void @s_shuffle_v4i64_v4i64__7_7_1_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_1_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s10, s20
-; GFX900-NEXT:    s_mov_b32 s11, s21
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s14, s18
+; GFX900-NEXT:    s_mov_b32 s15, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_4_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_1_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s10, s20
-; GFX90A-NEXT:    s_mov_b32 s11, s21
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s18
+; GFX90A-NEXT:    s_mov_b32 s15, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_6_4_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_1_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 6, i32 4, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_4_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_4_4:
+define void @s_shuffle_v4i64_v4i64__7_7_2_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_2_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_4_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_2_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_4_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_2_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_u_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_4:
+define void @s_shuffle_v4i64_v4i64__7_7_4_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_4_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_4_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_u_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_4_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_0_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_0_4:
+define void @s_shuffle_v4i64_v4i64__7_7_5_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_5_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s10, s22
-; GFX900-NEXT:    s_mov_b32 s11, s23
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_0_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_5_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s10, s22
-; GFX90A-NEXT:    s_mov_b32 s11, s23
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_0_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_5_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_1_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_1_4:
+define void @s_shuffle_v4i64_v4i64__7_7_6_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s10, s22
-; GFX900-NEXT:    s_mov_b32 s11, s23
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s14, s18
+; GFX900-NEXT:    s_mov_b32 s15, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_1_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s10, s22
-; GFX90A-NEXT:    s_mov_b32 s11, s23
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s18
+; GFX90A-NEXT:    s_mov_b32 s15, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_1_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_6_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_2_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_2_4:
+define void @s_shuffle_v4i64_v4i64__u_4_4_4() {
+; GFX9-LABEL: s_shuffle_v4i64_v4i64__u_4_4_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use s[8:15]
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 poison, i32 4, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__0_4_4_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__0_4_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_2_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__0_4_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_2_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__0_4_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_3_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_3_4:
+define void @s_shuffle_v4i64_v4i64__1_4_4_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__1_4_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_3_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__1_4_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_3_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__1_4_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[16:23]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s22
-; GFX942-NEXT:    s_mov_b32 s9, s23
-; GFX942-NEXT:    s_mov_b32 s10, s22
-; GFX942-NEXT:    s_mov_b32 s11, s23
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 1, i32 4, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_5_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_5_4:
+define void @s_shuffle_v4i64_v4i64__2_4_4_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__2_4_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__2_4_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__2_4_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 2, i32 4, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__3_4_4_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__3_4_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_5_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__3_4_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -21718,8 +20960,74 @@ define void @s_shuffle_v4i64_v4i64__7_7_5_4() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__3_4_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 3, i32 4, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__4_4_4_4() {
+; GFX9-LABEL: s_shuffle_v4i64_v4i64__4_4_4_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use s[8:15]
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__5_4_4_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__5_4_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__5_4_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
 ; GFX90A-NEXT:    s_mov_b32 s14, s4
 ; GFX90A-NEXT:    s_mov_b32 s15, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -21727,129 +21035,1656 @@ define void @s_shuffle_v4i64_v4i64__7_7_5_4() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_5_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__5_4_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 5, i32 4, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_6_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_4:
+define void @s_shuffle_v4i64_v4i64__6_4_4_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__6_4_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s10, s22
-; GFX900-NEXT:    s_mov_b32 s11, s23
-; GFX900-NEXT:    s_mov_b32 s12, s20
-; GFX900-NEXT:    s_mov_b32 s13, s21
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__6_4_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s10, s22
-; GFX90A-NEXT:    s_mov_b32 s11, s23
-; GFX90A-NEXT:    s_mov_b32 s12, s20
-; GFX90A-NEXT:    s_mov_b32 s13, s21
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_6_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__6_4_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 6, i32 4, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__7_4_4_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_4_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_4_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_4_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 4, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__7_u_4_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_u_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 poison, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__7_0_4_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_0_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_0_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_0_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 0, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__7_1_4_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_1_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_1_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_1_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 1, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__7_2_4_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_2_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_2_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_2_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[16:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[22:23]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 2, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__7_3_4_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_3_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_3_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_3_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[16:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[22:23]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 3, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__7_5_4_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s6
+; GFX900-NEXT:    s_mov_b32 s11, s7
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_5_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s6
+; GFX90A-NEXT:    s_mov_b32 s11, s7
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_5_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 5, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__7_6_4_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s10, s20
+; GFX900-NEXT:    s_mov_b32 s11, s21
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s10, s20
+; GFX90A-NEXT:    s_mov_b32 s11, s21
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_6_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 6, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__7_7_4_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__7_7_u_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_u_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__7_7_0_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_0_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s10, s22
+; GFX900-NEXT:    s_mov_b32 s11, s23
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_0_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s10, s22
+; GFX90A-NEXT:    s_mov_b32 s11, s23
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_0_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__7_7_1_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_1_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s10, s22
+; GFX900-NEXT:    s_mov_b32 s11, s23
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_1_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s10, s22
+; GFX90A-NEXT:    s_mov_b32 s11, s23
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_1_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__7_7_2_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_2_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_2_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_2_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__7_7_3_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_3_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_3_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_3_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[16:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[22:23]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[22:23]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__7_7_5_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_5_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_5_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_5_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__7_7_6_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s10, s22
+; GFX900-NEXT:    s_mov_b32 s11, s23
+; GFX900-NEXT:    s_mov_b32 s12, s20
+; GFX900-NEXT:    s_mov_b32 s13, s21
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s10, s22
+; GFX90A-NEXT:    s_mov_b32 s11, s23
+; GFX90A-NEXT:    s_mov_b32 s12, s20
+; GFX90A-NEXT:    s_mov_b32 s13, s21
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_6_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__u_5_5_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__u_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__u_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__u_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 poison, i32 5, i32 5, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__0_5_5_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__0_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__0_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__0_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 0, i32 5, i32 5, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__1_5_5_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__1_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__1_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__1_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 1, i32 5, i32 5, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__2_5_5_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__2_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s16
+; GFX900-NEXT:    s_mov_b32 s9, s17
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__2_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s17
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__2_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__3_5_5_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__3_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__3_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__3_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 3, i32 5, i32 5, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__4_5_5_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__4_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__4_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__4_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 4, i32 5, i32 5, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__5_5_5_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__5_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__5_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__5_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 5, i32 5, i32 5, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__6_5_5_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__6_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__6_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__6_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 6, i32 5, i32 5, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__7_5_5_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 5, i32 5, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__7_u_5_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s14, s6
+; GFX900-NEXT:    s_mov_b32 s15, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s14, s6
+; GFX90A-NEXT:    s_mov_b32 s15, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_u_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 4>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__u_5_5_5() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__u_5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 poison, i32 5, i32 5, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 poison, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__0_5_5_5() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__0_5_5_5:
+define void @s_shuffle_v4i64_v4i64__7_0_5_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_0_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -21857,17 +22692,19 @@ define void @s_shuffle_v4i64_v4i64__0_5_5_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__0_5_5_5:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_0_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -21875,74 +22712,69 @@ define void @s_shuffle_v4i64_v4i64__0_5_5_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__0_5_5_5:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_0_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 0, i32 5, i32 5, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 0, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__1_5_5_5() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__1_5_5_5:
+define void @s_shuffle_v4i64_v4i64__7_1_5_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_1_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s12, s10
-; GFX900-NEXT:    s_mov_b32 s13, s11
-; GFX900-NEXT:    s_mov_b32 s14, s10
-; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__1_5_5_5:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_1_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s12, s10
-; GFX90A-NEXT:    s_mov_b32 s13, s11
-; GFX90A-NEXT:    s_mov_b32 s14, s10
-; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__1_5_5_5:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_1_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -21951,253 +22783,150 @@ define void @s_shuffle_v4i64_v4i64__1_5_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 1, i32 5, i32 5, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 1, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__2_5_5_5() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__2_5_5_5:
+define void @s_shuffle_v4i64_v4i64__7_2_5_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_2_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s16
-; GFX900-NEXT:    s_mov_b32 s9, s17
-; GFX900-NEXT:    s_mov_b32 s12, s10
-; GFX900-NEXT:    s_mov_b32 s13, s11
-; GFX900-NEXT:    s_mov_b32 s14, s10
-; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s14, s6
+; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__2_5_5_5:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_2_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s16
-; GFX90A-NEXT:    s_mov_b32 s9, s17
-; GFX90A-NEXT:    s_mov_b32 s12, s10
-; GFX90A-NEXT:    s_mov_b32 s13, s11
-; GFX90A-NEXT:    s_mov_b32 s14, s10
-; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s14, s6
+; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__2_5_5_5:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_2_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[12:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 2, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__3_5_5_5() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__3_5_5_5:
+define void @s_shuffle_v4i64_v4i64__7_3_5_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_3_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s12, s10
-; GFX900-NEXT:    s_mov_b32 s13, s11
-; GFX900-NEXT:    s_mov_b32 s14, s10
-; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__3_5_5_5:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_3_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s10
-; GFX90A-NEXT:    s_mov_b32 s13, s11
-; GFX90A-NEXT:    s_mov_b32 s14, s10
-; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__3_5_5_5:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_3_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[12:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 3, i32 5, i32 5, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__4_5_5_5() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__4_5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 4, i32 5, i32 5, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__5_5_5_5() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__5_5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 5, i32 5, i32 5, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__6_5_5_5() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__6_5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 6, i32 5, i32 5, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__7_5_5_5() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 5, i32 5, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 3, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_u_5_5() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_5_5:
+define void @s_shuffle_v4i64_v4i64__7_4_5_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_4_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -22205,6 +22934,8 @@ define void @s_shuffle_v4i64_v4i64__7_u_5_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
 ; GFX900-NEXT:    s_mov_b32 s12, s6
 ; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    s_mov_b32 s14, s6
@@ -22214,7 +22945,7 @@ define void @s_shuffle_v4i64_v4i64__7_u_5_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_5_5:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_4_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -22222,6 +22953,8 @@ define void @s_shuffle_v4i64_v4i64__7_u_5_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
 ; GFX90A-NEXT:    s_mov_b32 s12, s6
 ; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    s_mov_b32 s14, s6
@@ -22231,43 +22964,38 @@ define void @s_shuffle_v4i64_v4i64__7_u_5_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_u_5_5:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_4_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 poison, i32 5, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 4, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_0_5_5() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_0_5_5:
+define void @s_shuffle_v4i64_v4i64__7_6_5_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s10, s16
+; GFX900-NEXT:    s_mov_b32 s11, s17
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -22275,19 +23003,16 @@ define void @s_shuffle_v4i64_v4i64__7_0_5_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_0_5_5:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s11, s17
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -22295,114 +23020,92 @@ define void @s_shuffle_v4i64_v4i64__7_0_5_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_0_5_5:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_6_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 0, i32 5, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 6, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_1_5_5() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_1_5_5:
+define void @s_shuffle_v4i64_v4i64__7_7_5_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s14, s6
+; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_1_5_5:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s14, s6
+; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_1_5_5:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 1, i32 5, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_2_5_5() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_2_5_5:
+define void @s_shuffle_v4i64_v4i64__7_7_u_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    s_mov_b32 s14, s6
 ; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -22410,21 +23113,14 @@ define void @s_shuffle_v4i64_v4i64__7_2_5_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_2_5_5:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    s_mov_b32 s14, s6
 ; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -22432,34 +23128,28 @@ define void @s_shuffle_v4i64_v4i64__7_2_5_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_2_5_5:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_u_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 2, i32 5, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_3_5_5() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_3_5_5:
+define void @s_shuffle_v4i64_v4i64__7_7_0_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_0_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -22470,14 +23160,16 @@ define void @s_shuffle_v4i64_v4i64__7_3_5_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s10, s18
+; GFX900-NEXT:    s_mov_b32 s11, s19
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_3_5_5:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_0_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -22488,174 +23180,176 @@ define void @s_shuffle_v4i64_v4i64__7_3_5_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s18
+; GFX90A-NEXT:    s_mov_b32 s11, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_3_5_5:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_0_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 3, i32 5, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_4_5_5() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_4_5_5:
+define void @s_shuffle_v4i64_v4i64__7_7_1_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_1_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s10, s18
+; GFX900-NEXT:    s_mov_b32 s11, s19
 ; GFX900-NEXT:    s_mov_b32 s12, s6
 ; GFX900-NEXT:    s_mov_b32 s13, s7
-; GFX900-NEXT:    s_mov_b32 s14, s6
-; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_4_5_5:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_1_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s10, s18
+; GFX90A-NEXT:    s_mov_b32 s11, s19
 ; GFX90A-NEXT:    s_mov_b32 s12, s6
 ; GFX90A-NEXT:    s_mov_b32 s13, s7
-; GFX90A-NEXT:    s_mov_b32 s14, s6
-; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_4_5_5:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_1_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 4, i32 5, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_6_5_5() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_5_5:
+define void @s_shuffle_v4i64_v4i64__7_7_2_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_2_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s16
-; GFX900-NEXT:    s_mov_b32 s11, s17
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s14, s6
+; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_5_5:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_2_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s16
-; GFX90A-NEXT:    s_mov_b32 s11, s17
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s6
+; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_6_5_5:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_2_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 6, i32 5, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_5_5() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_5_5:
+define void @s_shuffle_v4i64_v4i64__7_7_3_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_3_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    s_mov_b32 s14, s6
 ; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -22663,16 +23357,19 @@ define void @s_shuffle_v4i64_v4i64__7_7_5_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_5_5:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_3_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    s_mov_b32 s14, s6
 ; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -22680,33 +23377,31 @@ define void @s_shuffle_v4i64_v4i64__7_7_5_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_5_5:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_3_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_u_5() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_5:
+define void @s_shuffle_v4i64_v4i64__7_7_4_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_4_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -22714,6 +23409,8 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
 ; GFX900-NEXT:    s_mov_b32 s14, s6
 ; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -22721,7 +23418,7 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_5:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_4_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -22729,6 +23426,8 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
 ; GFX90A-NEXT:    s_mov_b32 s14, s6
 ; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -22736,201 +23435,175 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_u_5:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_4_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_0_5() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_0_5:
+define void @s_shuffle_v4i64_v4i64__7_7_6_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
 ; GFX900-NEXT:    s_mov_b32 s10, s18
 ; GFX900-NEXT:    s_mov_b32 s11, s19
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_0_5:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
 ; GFX90A-NEXT:    s_mov_b32 s10, s18
 ; GFX90A-NEXT:    s_mov_b32 s11, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_0_5:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_6_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_1_5() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_1_5:
+define void @s_shuffle_v4i64_v4i64__u_6_6_6() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__u_6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s18
-; GFX900-NEXT:    s_mov_b32 s11, s19
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_1_5:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__u_6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s18
-; GFX90A-NEXT:    s_mov_b32 s11, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_1_5:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__u_6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 poison, i32 6, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_2_5() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_2_5:
+define void @s_shuffle_v4i64_v4i64__0_6_6_6() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__0_6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s14, s6
-; GFX900-NEXT:    s_mov_b32 s15, s7
+; GFX900-NEXT:    s_mov_b32 s10, s16
+; GFX900-NEXT:    s_mov_b32 s11, s17
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_2_5:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__0_6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s14, s6
-; GFX90A-NEXT:    s_mov_b32 s15, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_2_5:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__0_6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -22939,311 +23612,271 @@ define void @s_shuffle_v4i64_v4i64__7_7_2_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 0, i32 6, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_3_5() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_3_5:
+define void @s_shuffle_v4i64_v4i64__1_6_6_6() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__1_6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
-; GFX900-NEXT:    s_mov_b32 s14, s6
-; GFX900-NEXT:    s_mov_b32 s15, s7
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_3_5:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__1_6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
-; GFX90A-NEXT:    s_mov_b32 s14, s6
-; GFX90A-NEXT:    s_mov_b32 s15, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_3_5:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__1_6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s18
-; GFX942-NEXT:    s_mov_b32 s11, s19
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 1, i32 6, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_4_5() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_4_5:
+define void @s_shuffle_v4i64_v4i64__2_6_6_6() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__2_6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s6
-; GFX900-NEXT:    s_mov_b32 s15, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s16
+; GFX900-NEXT:    s_mov_b32 s9, s17
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_4_5:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__2_6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s6
-; GFX90A-NEXT:    s_mov_b32 s15, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_4_5:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__2_6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 2, i32 6, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_6_5() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_5:
+define void @s_shuffle_v4i64_v4i64__3_6_6_6() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__3_6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s18
-; GFX900-NEXT:    s_mov_b32 s11, s19
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_5:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__3_6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s18
-; GFX90A-NEXT:    s_mov_b32 s11, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_6_5:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__3_6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__u_6_6_6() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__u_6_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 poison, i32 6, i32 6, i32 6>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 3, i32 6, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__0_6_6_6() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__0_6_6_6:
+define void @s_shuffle_v4i64_v4i64__4_6_6_6() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__4_6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s16
-; GFX900-NEXT:    s_mov_b32 s11, s17
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__0_6_6_6:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__4_6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s16
-; GFX90A-NEXT:    s_mov_b32 s11, s17
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__0_6_6_6:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__4_6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 0, i32 6, i32 6, i32 6>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 4, i32 6, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__1_6_6_6() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__1_6_6_6:
+define void @s_shuffle_v4i64_v4i64__5_6_6_6() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__5_6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s14, s12
@@ -23253,17 +23886,14 @@ define void @s_shuffle_v4i64_v4i64__1_6_6_6() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__1_6_6_6:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__5_6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
@@ -23273,44 +23903,35 @@ define void @s_shuffle_v4i64_v4i64__1_6_6_6() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__1_6_6_6:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__5_6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 1, i32 6, i32 6, i32 6>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 5, i32 6, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__2_6_6_6() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__2_6_6_6:
+define void @s_shuffle_v4i64_v4i64__6_6_6_6() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__6_6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s16
-; GFX900-NEXT:    s_mov_b32 s9, s17
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s14, s12
@@ -23320,17 +23941,14 @@ define void @s_shuffle_v4i64_v4i64__2_6_6_6() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__2_6_6_6:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__6_6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s16
-; GFX90A-NEXT:    s_mov_b32 s9, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
@@ -23340,44 +23958,35 @@ define void @s_shuffle_v4i64_v4i64__2_6_6_6() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__2_6_6_6:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__6_6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 2, i32 6, i32 6, i32 6>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__3_6_6_6() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__3_6_6_6:
+define void @s_shuffle_v4i64_v4i64__7_6_6_6() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s14, s12
@@ -23387,17 +23996,14 @@ define void @s_shuffle_v4i64_v4i64__3_6_6_6() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__3_6_6_6:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
@@ -23407,119 +24013,19 @@ define void @s_shuffle_v4i64_v4i64__3_6_6_6() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__3_6_6_6:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 3, i32 6, i32 6, i32 6>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__4_6_6_6() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__4_6_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 4, i32 6, i32 6, i32 6>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__5_6_6_6() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__5_6_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 5, i32 6, i32 6, i32 6>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__6_6_6_6() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__6_6_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__7_6_6_6() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_6_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 6, i32 6, i32 6>
@@ -23528,20 +24034,48 @@ define void @s_shuffle_v4i64_v4i64__7_6_6_6() {
 }
 
 define void @s_shuffle_v4i64_v4i64__7_u_6_6() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_u_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_6_6:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_6_6:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_u_6_6:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 poison, i32 6, i32 6>
@@ -23599,12 +24133,9 @@ define void @s_shuffle_v4i64_v4i64__7_0_6_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23666,12 +24197,9 @@ define void @s_shuffle_v4i64_v4i64__7_1_6_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23733,12 +24261,9 @@ define void @s_shuffle_v4i64_v4i64__7_2_6_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23800,12 +24325,9 @@ define void @s_shuffle_v4i64_v4i64__7_3_6_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23862,14 +24384,10 @@ define void @s_shuffle_v4i64_v4i64__7_4_6_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23882,20 +24400,48 @@ define void @s_shuffle_v4i64_v4i64__7_4_6_6() {
 }
 
 define void @s_shuffle_v4i64_v4i64__7_5_6_6() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_5_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_6_6:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_5_6_6:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_5_6_6:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 5, i32 6, i32 6>
@@ -23904,22 +24450,53 @@ define void @s_shuffle_v4i64_v4i64__7_5_6_6() {
 }
 
 define void @s_shuffle_v4i64_v4i64__7_7_6_6() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_6:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_6:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_6_6:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 6>
@@ -23968,12 +24545,9 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24039,14 +24613,10 @@ define void @s_shuffle_v4i64_v4i64__7_7_0_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s18
-; GFX942-NEXT:    s_mov_b32 s11, s19
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24112,14 +24682,10 @@ define void @s_shuffle_v4i64_v4i64__7_7_1_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s18
-; GFX942-NEXT:    s_mov_b32 s11, s19
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24181,12 +24747,9 @@ define void @s_shuffle_v4i64_v4i64__7_7_2_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24252,14 +24815,10 @@ define void @s_shuffle_v4i64_v4i64__7_7_3_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s18
-; GFX942-NEXT:    s_mov_b32 s11, s19
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24312,14 +24871,10 @@ define void @s_shuffle_v4i64_v4i64__7_7_4_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24376,14 +24931,10 @@ define void @s_shuffle_v4i64_v4i64__7_7_5_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24396,20 +24947,48 @@ define void @s_shuffle_v4i64_v4i64__7_7_5_6() {
 }
 
 define void @s_shuffle_v4i64_v4i64__u_7_7_7() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__u_7_7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__u_7_7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__u_7_7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__u_7_7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 poison, i32 7, i32 7, i32 7>
@@ -24467,12 +25046,9 @@ define void @s_shuffle_v4i64_v4i64__0_7_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24534,12 +25110,9 @@ define void @s_shuffle_v4i64_v4i64__1_7_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24601,12 +25174,9 @@ define void @s_shuffle_v4i64_v4i64__2_7_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24659,117 +25229,232 @@ define void @s_shuffle_v4i64_v4i64__3_7_7_7() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__3_7_7_7:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__3_7_7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 3, i32 7, i32 7, i32 7>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__4_7_7_7() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__4_7_7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__4_7_7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__4_7_7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 4, i32 7, i32 7, i32 7>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__5_7_7_7() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__5_7_7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__5_7_7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__5_7_7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 5, i32 7, i32 7, i32 7>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__6_7_7_7() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__6_7_7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__6_7_7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__6_7_7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 6, i32 7, i32 7, i32 7>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__7_u_7_7() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_u_7_7:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 3, i32 7, i32 7, i32 7>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__4_7_7_7() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__4_7_7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 4, i32 7, i32 7, i32 7>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__5_7_7_7() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__5_7_7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 5, i32 7, i32 7, i32 7>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__6_7_7_7() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__6_7_7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 6, i32 7, i32 7, i32 7>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__7_u_7_7() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_u_7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 poison, i32 7, i32 7>
@@ -24827,12 +25512,9 @@ define void @s_shuffle_v4i64_v4i64__7_0_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24894,12 +25576,9 @@ define void @s_shuffle_v4i64_v4i64__7_1_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24961,12 +25640,9 @@ define void @s_shuffle_v4i64_v4i64__7_2_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25028,12 +25704,9 @@ define void @s_shuffle_v4i64_v4i64__7_3_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25090,14 +25763,10 @@ define void @s_shuffle_v4i64_v4i64__7_4_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25110,20 +25779,48 @@ define void @s_shuffle_v4i64_v4i64__7_4_7_7() {
 }
 
 define void @s_shuffle_v4i64_v4i64__7_5_7_7() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_5_7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_5_7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_5_7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 5, i32 7, i32 7>
@@ -25132,22 +25829,53 @@ define void @s_shuffle_v4i64_v4i64__7_5_7_7() {
 }
 
 define void @s_shuffle_v4i64_v4i64__7_6_7_7() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_6_7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_6_7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 6, i32 7, i32 7>
@@ -25156,20 +25884,48 @@ define void @s_shuffle_v4i64_v4i64__7_6_7_7() {
 }
 
 define void @s_shuffle_v4i64_v4i64__7_7_u_7() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_u_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_u_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 7>
@@ -25227,12 +25983,9 @@ define void @s_shuffle_v4i64_v4i64__7_7_0_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25294,12 +26047,9 @@ define void @s_shuffle_v4i64_v4i64__7_7_1_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25361,12 +26111,9 @@ define void @s_shuffle_v4i64_v4i64__7_7_2_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25428,12 +26175,9 @@ define void @s_shuffle_v4i64_v4i64__7_7_3_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25486,14 +26230,10 @@ define void @s_shuffle_v4i64_v4i64__7_7_4_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25550,14 +26290,10 @@ define void @s_shuffle_v4i64_v4i64__7_7_5_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25570,20 +26306,48 @@ define void @s_shuffle_v4i64_v4i64__7_7_5_7() {
 }
 
 define void @s_shuffle_v4i64_v4i64__7_7_6_7() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_6_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_6_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 7>
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll
index b30af835a7882..e34b6e95da51c 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll
@@ -45,6 +45,7 @@ define void @v_shuffle_v4p0_v2p0__0_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -84,10 +85,12 @@ define void @v_shuffle_v4p0_v2p0__1_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v2p0__1_u_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
@@ -140,10 +143,12 @@ define void @v_shuffle_v4p0_v2p0__3_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_u_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
@@ -203,7 +208,8 @@ define void @v_shuffle_v4p0_v2p0__3_0_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[2:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
@@ -254,14 +260,15 @@ define void @v_shuffle_v4p0_v2p0__3_1_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_1_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -308,7 +315,7 @@ define void @v_shuffle_v4p0_v2p0__3_2_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
@@ -351,10 +358,12 @@ define void @v_shuffle_v4p0_v2p0__3_3_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_3_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
@@ -2362,6 +2371,7 @@ define void @v_shuffle_v4p0_v2p0__0_2_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2401,10 +2411,12 @@ define void @v_shuffle_v4p0_v2p0__1_2_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v2p0__1_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
@@ -3734,8 +3746,7 @@ define void @s_shuffle_v4p0_v2p0__1_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3793,8 +3804,7 @@ define void @s_shuffle_v4p0_v2p0__3_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3852,10 +3862,8 @@ define void @s_shuffle_v4p0_v2p0__3_0_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3909,8 +3917,7 @@ define void @s_shuffle_v4p0_v2p0__3_1_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3959,10 +3966,8 @@ define void @s_shuffle_v4p0_v2p0__3_2_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3975,18 +3980,43 @@ define void @s_shuffle_v4p0_v2p0__3_2_u_u() {
 }
 
 define void @s_shuffle_v4p0_v2p0__3_3_u_u() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_3_u_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_u_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_u_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_3_u_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 poison, i32 poison>
@@ -3995,21 +4025,52 @@ define void @s_shuffle_v4p0_v2p0__3_3_u_u() {
 }
 
 define void @s_shuffle_v4p0_v2p0__3_3_0_u() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_3_0_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_0_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_0_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_3_0_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 0, i32 poison>
@@ -4063,10 +4124,8 @@ define void @s_shuffle_v4p0_v2p0__3_3_1_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4079,20 +4138,48 @@ define void @s_shuffle_v4p0_v2p0__3_3_1_u() {
 }
 
 define void @s_shuffle_v4p0_v2p0__3_3_2_u() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_3_2_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_2_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_2_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_3_2_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 2, i32 poison>
@@ -4101,20 +4188,48 @@ define void @s_shuffle_v4p0_v2p0__3_3_2_u() {
 }
 
 define void @s_shuffle_v4p0_v2p0__3_3_3_u() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_3_3_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_3_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_3_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_3_3_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 3, i32 poison>
@@ -4172,12 +4287,9 @@ define void @s_shuffle_v4p0_v2p0__3_3_3_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4190,62 +4302,96 @@ define void @s_shuffle_v4p0_v2p0__3_3_3_0() {
 }
 
 define void @s_shuffle_v4p0_v2p0__3_3_3_1() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_3_3_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <2 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <2 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 3, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v2p0__3_3_3_2() {
-; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_3_2:
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_3_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:7]
+; GFX900-NEXT:    ; def s[12:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s10, s6
-; GFX900-NEXT:    s_mov_b32 s11, s7
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_3_2:
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_3_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:7]
+; GFX90A-NEXT:    ; def s[12:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s10, s6
-; GFX90A-NEXT:    s_mov_b32 s11, s7
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_3_3_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <2 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <2 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 3, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v2p0__3_3_3_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_3_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:7]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s6
+; GFX900-NEXT:    s_mov_b32 s11, s7
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_3_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:7]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s6
+; GFX90A-NEXT:    s_mov_b32 s11, s7
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    s_mov_b32 s14, s4
 ; GFX90A-NEXT:    s_mov_b32 s15, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -4259,14 +4405,10 @@ define void @s_shuffle_v4p0_v2p0__3_3_3_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4279,22 +4421,53 @@ define void @s_shuffle_v4p0_v2p0__3_3_3_2() {
 }
 
 define void @s_shuffle_v4p0_v2p0__3_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_3_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -4303,20 +4476,48 @@ define void @s_shuffle_v4p0_v2p0__3_3_3_3() {
 }
 
 define void @s_shuffle_v4p0_v2p0__u_0_0_0() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__u_0_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__u_0_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__u_0_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__u_0_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> <i32 poison, i32 0, i32 0, i32 0>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
@@ -4324,22 +4525,53 @@ define void @s_shuffle_v4p0_v2p0__u_0_0_0() {
 }
 
 define void @s_shuffle_v4p0_v2p0__0_0_0_0() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__0_0_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_mov_b32 s13, s9
-; GFX9-NEXT:    s_mov_b32 s14, s8
-; GFX9-NEXT:    s_mov_b32 s15, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__0_0_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    s_mov_b32 s12, s8
+; GFX900-NEXT:    s_mov_b32 s13, s9
+; GFX900-NEXT:    s_mov_b32 s14, s8
+; GFX900-NEXT:    s_mov_b32 s15, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__0_0_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    s_mov_b32 s12, s8
+; GFX90A-NEXT:    s_mov_b32 s13, s9
+; GFX90A-NEXT:    s_mov_b32 s14, s8
+; GFX90A-NEXT:    s_mov_b32 s15, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__0_0_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[8:9]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> zeroinitializer
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
@@ -4347,61 +4579,14 @@ define void @s_shuffle_v4p0_v2p0__0_0_0_0() {
 }
 
 define void @s_shuffle_v4p0_v2p0__1_0_0_0() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__1_0_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <2 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v2p0__2_0_0_0() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__2_0_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <2 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> <i32 2, i32 0, i32 0, i32 0>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v2p0__3_0_0_0() {
-; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_0_0_0:
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__1_0_0_0:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:7]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s14, s12
@@ -4411,54 +4596,526 @@ define void @s_shuffle_v4p0_v2p0__3_0_0_0() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_0_0_0:
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__1_0_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__1_0_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <2 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v2p0__2_0_0_0() {
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__2_0_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__2_0_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__2_0_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <2 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> <i32 2, i32 0, i32 0, i32 0>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v2p0__3_0_0_0() {
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_0_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:7]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_0_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:7]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_0_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:3]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <2 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <2 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v2p0__3_u_0_0() {
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_u_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:7]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_u_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:7]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_u_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:3]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <2 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <2 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 poison, i32 0, i32 0>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v2p0__3_1_0_0() {
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_1_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:7]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_1_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:7]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_1_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:3]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <2 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <2 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 1, i32 0, i32 0>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v2p0__3_2_0_0() {
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_2_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:7]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_2_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:7]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_2_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:3]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <2 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <2 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 2, i32 0, i32 0>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v2p0__3_3_0_0() {
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_3_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <2 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <2 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 0, i32 0>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v2p0__3_3_u_0() {
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_u_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:7]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_u_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:7]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_3_u_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:3]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <2 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <2 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 poison, i32 0>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v2p0__3_3_1_0() {
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_1_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:7]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_1_0:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ; def s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:7]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s14, s12
-; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_0_0_0:
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_3_1_0:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ; def s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
+  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v2p0__3_u_0_0() {
-; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_u_0_0:
+define void @s_shuffle_v4p0_v2p0__3_3_2_0() {
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_2_0:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -4467,16 +5124,18 @@ define void @s_shuffle_v4p0_v2p0__3_u_0_0() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:7]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s14, s12
-; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_u_0_0:
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_2_0:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -4485,16 +5144,18 @@ define void @s_shuffle_v4p0_v2p0__3_u_0_0() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:7]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s14, s12
-; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_u_0_0:
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_3_2_0:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -4503,182 +5164,223 @@ define void @s_shuffle_v4p0_v2p0__3_u_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 poison, i32 0, i32 0>
+  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 2, i32 0>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v2p0__3_1_0_0() {
-; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_1_0_0:
+define void @s_shuffle_v4p0_v2p0__u_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__u_1_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:7]
+; GFX900-NEXT:    ; def s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s14, s12
-; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_1_0_0:
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__u_1_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:15]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:7]
+; GFX90A-NEXT:    ; def s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s14, s12
-; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_1_0_0:
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__u_1_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:3]
+; GFX942-NEXT:    ; def s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <2 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 1, i32 0, i32 0>
+  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> <i32 poison, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v2p0__3_2_0_0() {
-; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_2_0_0:
+define void @s_shuffle_v4p0_v2p0__0_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__0_1_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:7]
+; GFX900-NEXT:    ; def s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s14, s12
-; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_2_0_0:
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__0_1_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:15]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:7]
+; GFX90A-NEXT:    ; def s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s12
-; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_2_0_0:
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__0_1_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ; def s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:3]
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <2 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v2p0__1_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__1_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__1_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__1_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <2 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 2, i32 0, i32 0>
+  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v2p0__3_3_0_0() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_3_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+define void @s_shuffle_v4p0_v2p0__2_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__2_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__2_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__2_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <2 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 0, i32 0>
+  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v2p0__3_3_u_0() {
-; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_u_0:
+define void @s_shuffle_v4p0_v2p0__3_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_1_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -4687,16 +5389,18 @@ define void @s_shuffle_v4p0_v2p0__3_3_u_0() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:7]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_u_0:
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_1_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -4705,16 +5409,18 @@ define void @s_shuffle_v4p0_v2p0__3_3_u_0() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:7]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_3_u_0:
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_1_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -4723,90 +5429,81 @@ define void @s_shuffle_v4p0_v2p0__3_3_u_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 poison, i32 0>
+  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v2p0__3_3_1_0() {
-; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_1_0:
+define void @s_shuffle_v4p0_v2p0__3_u_1_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_u_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ; def s[12:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:7]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_1_0:
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_u_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ; def s[12:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:7]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_3_1_0:
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_u_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ; def s[12:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
+  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 poison, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v2p0__3_3_2_0() {
-; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_2_0:
+define void @s_shuffle_v4p0_v2p0__3_0_1_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_0_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -4815,18 +5512,18 @@ define void @s_shuffle_v4p0_v2p0__3_3_2_0() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:7]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_2_0:
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_0_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -4835,18 +5532,18 @@ define void @s_shuffle_v4p0_v2p0__3_3_2_0() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:7]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_3_2_0:
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_0_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -4855,188 +5552,96 @@ define void @s_shuffle_v4p0_v2p0__3_3_2_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 2, i32 0>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v2p0__u_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__u_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <2 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> <i32 poison, i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v2p0__0_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__0_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <2 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v2p0__1_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__1_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <2 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v2p0__2_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__2_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <2 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
+  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 0, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v2p0__3_1_1_1() {
-; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_1_1_1:
+define void @s_shuffle_v4p0_v2p0__3_2_1_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_2_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ; def s[12:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:7]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s6
 ; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s12, s10
-; GFX900-NEXT:    s_mov_b32 s13, s11
-; GFX900-NEXT:    s_mov_b32 s14, s10
-; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_1_1_1:
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_2_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ; def s[12:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:7]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s6
 ; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s12, s10
-; GFX90A-NEXT:    s_mov_b32 s13, s11
-; GFX90A-NEXT:    s_mov_b32 s14, s10
-; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_1_1_1:
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_2_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ; def s[12:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
+  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 2, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v2p0__3_u_1_1() {
-; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_u_1_1:
+define void @s_shuffle_v4p0_v2p0__3_3_1_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:7]
+; GFX900-NEXT:    ; def s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -5044,17 +5649,17 @@ define void @s_shuffle_v4p0_v2p0__3_u_1_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_u_1_1:
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:7]
+; GFX90A-NEXT:    ; def s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -5062,228 +5667,129 @@ define void @s_shuffle_v4p0_v2p0__3_u_1_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_u_1_1:
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_3_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:3]
+; GFX942-NEXT:    ; def s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 poison, i32 1, i32 1>
+  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v2p0__3_0_1_1() {
-; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_0_1_1:
+define void @s_shuffle_v4p0_v2p0__3_3_u_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_u_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ; def s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:7]
+; GFX900-NEXT:    ; def s[12:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_0_1_1:
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_u_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ; def s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:7]
+; GFX90A-NEXT:    ; def s[12:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_0_1_1:
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_3_u_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ; def s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:3]
+; GFX942-NEXT:    ; def s[12:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 0, i32 1, i32 1>
+  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 poison, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v2p0__3_2_1_1() {
-; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_2_1_1:
+define void @s_shuffle_v4p0_v2p0__3_3_0_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_0_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ; def s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:7]
+; GFX900-NEXT:    ; def s[12:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_2_1_1:
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_0_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ; def s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:7]
+; GFX90A-NEXT:    ; def s[12:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_2_1_1:
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_3_0_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ; def s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:3]
+; GFX942-NEXT:    ; def s[12:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <2 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <2 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 2, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v2p0__3_3_1_1() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_3_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <2 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <2 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v2p0__3_3_u_1() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_3_u_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <2 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <2 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 poison, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v2p0__3_3_0_1() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_3_0_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 0, i32 1>
@@ -5341,12 +5847,9 @@ define void @s_shuffle_v4p0_v2p0__3_3_2_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5445,8 +5948,7 @@ define void @s_shuffle_v4p0_v2p0__1_2_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5472,22 +5974,53 @@ define void @s_shuffle_v4p0_v2p0__2_2_2_2() {
 }
 
 define void @s_shuffle_v4p0_v2p0__3_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_2_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_2_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_2_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 2, i32 2, i32 2>
@@ -5496,41 +6029,189 @@ define void @s_shuffle_v4p0_v2p0__3_2_2_2() {
 }
 
 define void @s_shuffle_v4p0_v2p0__3_u_2_2() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_u_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_u_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_u_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_u_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <2 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <2 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 poison, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v2p0__3_0_2_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_0_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:7]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_0_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:7]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_0_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:3]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <2 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <2 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 0, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v2p0__3_1_2_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_1_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_1_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_1_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 poison, i32 2, i32 2>
+  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 1, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v2p0__3_0_2_2() {
-; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_0_2_2:
+define void @s_shuffle_v4p0_v2p0__3_3_2_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:7]
-; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s14
 ; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
 ; GFX900-NEXT:    s_mov_b32 s14, s12
 ; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -5538,19 +6219,16 @@ define void @s_shuffle_v4p0_v2p0__3_0_2_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_0_2_2:
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:7]
-; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s14
 ; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
 ; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -5558,74 +6236,19 @@ define void @s_shuffle_v4p0_v2p0__3_0_2_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_0_2_2:
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_3_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:3]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <2 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <2 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 0, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v2p0__3_1_2_2() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_1_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <2 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <2 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 1, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v2p0__3_3_2_2() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_3_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 2, i32 2>
@@ -5674,12 +6297,9 @@ define void @s_shuffle_v4p0_v2p0__3_3_u_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5741,12 +6361,9 @@ define void @s_shuffle_v4p0_v2p0__3_3_0_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5812,14 +6429,10 @@ define void @s_shuffle_v4p0_v2p0__3_3_1_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5832,20 +6445,48 @@ define void @s_shuffle_v4p0_v2p0__3_3_1_2() {
 }
 
 define void @s_shuffle_v4p0_v2p0__u_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__u_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__u_3_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__u_3_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__u_3_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 poison, i32 3, i32 3, i32 3>
@@ -5854,23 +6495,57 @@ define void @s_shuffle_v4p0_v2p0__u_3_3_3() {
 }
 
 define void @s_shuffle_v4p0_v2p0__0_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__0_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__0_3_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__0_3_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__0_3_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 0, i32 3, i32 3, i32 3>
@@ -5928,12 +6603,9 @@ define void @s_shuffle_v4p0_v2p0__1_3_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5946,20 +6618,48 @@ define void @s_shuffle_v4p0_v2p0__1_3_3_3() {
 }
 
 define void @s_shuffle_v4p0_v2p0__2_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__2_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__2_3_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__2_3_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__2_3_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
@@ -5968,20 +6668,48 @@ define void @s_shuffle_v4p0_v2p0__2_3_3_3() {
 }
 
 define void @s_shuffle_v4p0_v2p0__3_u_3_3() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_u_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_u_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_u_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_u_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 poison, i32 3, i32 3>
@@ -6039,12 +6767,9 @@ define void @s_shuffle_v4p0_v2p0__3_0_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6057,23 +6782,57 @@ define void @s_shuffle_v4p0_v2p0__3_0_3_3() {
 }
 
 define void @s_shuffle_v4p0_v2p0__3_1_3_3() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_1_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_1_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_1_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_1_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 1, i32 3, i32 3>
@@ -6082,22 +6841,53 @@ define void @s_shuffle_v4p0_v2p0__3_1_3_3() {
 }
 
 define void @s_shuffle_v4p0_v2p0__3_2_3_3() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_2_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_2_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_2_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_2_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 2, i32 3, i32 3>
@@ -6106,20 +6896,48 @@ define void @s_shuffle_v4p0_v2p0__3_2_3_3() {
 }
 
 define void @s_shuffle_v4p0_v2p0__3_3_u_3() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_3_u_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_u_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_u_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_3_u_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 poison, i32 3>
@@ -6128,23 +6946,57 @@ define void @s_shuffle_v4p0_v2p0__3_3_u_3() {
 }
 
 define void @s_shuffle_v4p0_v2p0__3_3_0_3() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_3_0_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_0_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_0_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_3_0_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 0, i32 3>
@@ -6202,12 +7054,9 @@ define void @s_shuffle_v4p0_v2p0__3_3_1_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6220,20 +7069,48 @@ define void @s_shuffle_v4p0_v2p0__3_3_1_3() {
 }
 
 define void @s_shuffle_v4p0_v2p0__3_3_2_3() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_3_2_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_2_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_2_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_3_2_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll
index e6ac554735eee..93b0a34460a49 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll
@@ -87,6 +87,7 @@ define void @v_shuffle_v4p0_v3p0__1_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -126,10 +127,12 @@ define void @v_shuffle_v4p0_v3p0__2_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v3p0__2_u_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
@@ -182,6 +185,7 @@ define void @v_shuffle_v4p0_v3p0__4_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -222,10 +226,12 @@ define void @v_shuffle_v4p0_v3p0__5_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_u_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
@@ -285,11 +291,11 @@ define void @v_shuffle_v4p0_v3p0__5_0_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[2:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b32_e32 v2, v6
+; GFX942-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -336,14 +342,15 @@ define void @v_shuffle_v4p0_v3p0__5_1_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_1_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v10, 0
+; GFX942-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v8
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v9
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1]
@@ -392,14 +399,15 @@ define void @v_shuffle_v4p0_v3p0__5_2_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_2_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[6:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v10
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v11
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[2:5], s[0:1]
@@ -446,10 +454,12 @@ define void @v_shuffle_v4p0_v3p0__5_3_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_3_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
@@ -494,10 +504,12 @@ define void @v_shuffle_v4p0_v3p0__5_4_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_4_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
@@ -540,10 +552,12 @@ define void @v_shuffle_v4p0_v3p0__5_5_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_5_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
@@ -4699,6 +4713,7 @@ define void @v_shuffle_v4p0_v3p0__1_3_3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4738,10 +4753,12 @@ define void @v_shuffle_v4p0_v3p0__2_3_3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v3p0__2_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
@@ -7818,8 +7835,7 @@ define void @s_shuffle_v4p0_v3p0__1_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7859,8 +7875,7 @@ define void @s_shuffle_v4p0_v3p0__2_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7918,8 +7933,7 @@ define void @s_shuffle_v4p0_v3p0__4_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7960,8 +7974,7 @@ define void @s_shuffle_v4p0_v3p0__5_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8016,11 +8029,11 @@ define void @s_shuffle_v4p0_v3p0__5_0_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8070,8 +8083,7 @@ define void @s_shuffle_v4p0_v3p0__5_1_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8125,10 +8137,8 @@ define void @s_shuffle_v4p0_v3p0__5_2_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8173,10 +8183,8 @@ define void @s_shuffle_v4p0_v3p0__5_3_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8189,18 +8197,43 @@ define void @s_shuffle_v4p0_v3p0__5_3_u_u() {
 }
 
 define void @s_shuffle_v4p0_v3p0__5_4_u_u() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_4_u_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_4_u_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_4_u_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_4_u_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 5, i32 4, i32 poison, i32 poison>
@@ -8245,10 +8278,8 @@ define void @s_shuffle_v4p0_v3p0__5_5_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8310,12 +8341,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_0_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8377,12 +8405,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_1_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8440,10 +8465,8 @@ define void @s_shuffle_v4p0_v3p0__5_5_2_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8492,12 +8515,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_3_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8550,12 +8570,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_4_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8568,20 +8585,48 @@ define void @s_shuffle_v4p0_v3p0__5_5_4_u() {
 }
 
 define void @s_shuffle_v4p0_v3p0__5_5_5_u() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_5_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_5_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_5_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_5_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 5, i32 5, i32 5, i32 poison>
@@ -8639,12 +8684,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_5_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8706,12 +8748,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_5_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8773,12 +8812,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_5_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8835,14 +8871,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_5_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8895,14 +8927,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_5_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8915,22 +8943,53 @@ define void @s_shuffle_v4p0_v3p0__5_5_5_4() {
 }
 
 define void @s_shuffle_v4p0_v3p0__5_5_5_5() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 5, i32 5, i32 5, i32 5>
@@ -8979,12 +9038,9 @@ define void @s_shuffle_v4p0_v3p0__u_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8996,22 +9052,53 @@ define void @s_shuffle_v4p0_v3p0__u_0_0_0() {
 }
 
 define void @s_shuffle_v4p0_v3p0__0_0_0_0() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__0_0_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_mov_b32 s13, s9
-; GFX9-NEXT:    s_mov_b32 s14, s8
-; GFX9-NEXT:    s_mov_b32 s15, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__0_0_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    s_mov_b32 s12, s8
+; GFX900-NEXT:    s_mov_b32 s13, s9
+; GFX900-NEXT:    s_mov_b32 s14, s8
+; GFX900-NEXT:    s_mov_b32 s15, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__0_0_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    s_mov_b32 s12, s8
+; GFX90A-NEXT:    s_mov_b32 s13, s9
+; GFX90A-NEXT:    s_mov_b32 s14, s8
+; GFX90A-NEXT:    s_mov_b32 s15, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__0_0_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[8:9]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> zeroinitializer
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
@@ -9063,14 +9150,10 @@ define void @s_shuffle_v4p0_v3p0__1_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9122,14 +9205,10 @@ define void @s_shuffle_v4p0_v3p0__2_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9181,12 +9260,9 @@ define void @s_shuffle_v4p0_v3p0__3_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9248,17 +9324,13 @@ define void @s_shuffle_v4p0_v3p0__4_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9321,15 +9393,12 @@ define void @s_shuffle_v4p0_v3p0__5_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9388,13 +9457,11 @@ define void @s_shuffle_v4p0_v3p0__5_u_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9457,15 +9524,12 @@ define void @s_shuffle_v4p0_v3p0__5_1_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9527,14 +9591,10 @@ define void @s_shuffle_v4p0_v3p0__5_2_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9597,15 +9657,12 @@ define void @s_shuffle_v4p0_v3p0__5_3_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9667,12 +9724,9 @@ define void @s_shuffle_v4p0_v3p0__5_4_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9738,14 +9792,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9807,12 +9857,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_u_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9878,14 +9925,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_1_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9951,14 +9994,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_2_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10020,12 +10059,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_3_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10091,14 +10127,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_4_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10111,87 +10143,205 @@ define void @s_shuffle_v4p0_v3p0__5_5_4_0() {
 }
 
 define void @s_shuffle_v4p0_v3p0__u_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__u_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 poison, i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v3p0__0_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__0_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v3p0__1_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__1_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__u_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__u_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__u_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 poison, i32 1, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v3p0__0_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__0_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__0_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__0_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v3p0__1_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__1_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__1_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__1_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
 define void @s_shuffle_v4p0_v3p0__2_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__2_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__2_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__2_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__2_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
@@ -10199,20 +10349,48 @@ define void @s_shuffle_v4p0_v3p0__2_1_1_1() {
 }
 
 define void @s_shuffle_v4p0_v3p0__3_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__3_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__3_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__3_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__3_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
@@ -10269,12 +10447,9 @@ define void @s_shuffle_v4p0_v3p0__4_1_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10332,12 +10507,9 @@ define void @s_shuffle_v4p0_v3p0__5_1_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10396,13 +10568,11 @@ define void @s_shuffle_v4p0_v3p0__5_u_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10465,15 +10635,12 @@ define void @s_shuffle_v4p0_v3p0__5_0_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10531,14 +10698,10 @@ define void @s_shuffle_v4p0_v3p0__5_2_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10601,15 +10764,12 @@ define void @s_shuffle_v4p0_v3p0__5_3_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10671,12 +10831,9 @@ define void @s_shuffle_v4p0_v3p0__5_4_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10742,14 +10899,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10811,12 +10964,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_u_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10882,14 +11032,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_0_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10951,14 +11097,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_2_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11020,12 +11162,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_3_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11091,14 +11230,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_4_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11111,20 +11246,48 @@ define void @s_shuffle_v4p0_v3p0__5_5_4_1() {
 }
 
 define void @s_shuffle_v4p0_v3p0__u_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__u_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__u_2_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__u_2_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__u_2_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 poison, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
@@ -11132,20 +11295,48 @@ define void @s_shuffle_v4p0_v3p0__u_2_2_2() {
 }
 
 define void @s_shuffle_v4p0_v3p0__0_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__0_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__0_2_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__0_2_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__0_2_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 0, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
@@ -11153,22 +11344,53 @@ define void @s_shuffle_v4p0_v3p0__0_2_2_2() {
 }
 
 define void @s_shuffle_v4p0_v3p0__1_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__1_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__1_2_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__1_2_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__1_2_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 1, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
@@ -11176,22 +11398,53 @@ define void @s_shuffle_v4p0_v3p0__1_2_2_2() {
 }
 
 define void @s_shuffle_v4p0_v3p0__2_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__2_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__2_2_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__2_2_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__2_2_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
@@ -11199,20 +11452,48 @@ define void @s_shuffle_v4p0_v3p0__2_2_2_2() {
 }
 
 define void @s_shuffle_v4p0_v3p0__3_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__3_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__3_2_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__3_2_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__3_2_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 3, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
@@ -11269,12 +11550,9 @@ define void @s_shuffle_v4p0_v3p0__4_2_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11332,12 +11610,9 @@ define void @s_shuffle_v4p0_v3p0__5_2_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11391,10 +11666,8 @@ define void @s_shuffle_v4p0_v3p0__5_u_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11456,14 +11729,10 @@ define void @s_shuffle_v4p0_v3p0__5_0_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11517,10 +11786,8 @@ define void @s_shuffle_v4p0_v3p0__5_1_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11578,12 +11845,9 @@ define void @s_shuffle_v4p0_v3p0__5_3_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11645,12 +11909,9 @@ define void @s_shuffle_v4p0_v3p0__5_4_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11712,12 +11973,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11779,12 +12037,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_u_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11846,14 +12101,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_0_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11919,14 +12170,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_1_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11992,12 +12239,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_3_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12063,14 +12307,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_4_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12169,8 +12409,7 @@ define void @s_shuffle_v4p0_v3p0__1_3_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12210,8 +12449,7 @@ define void @s_shuffle_v4p0_v3p0__2_3_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12281,14 +12519,10 @@ define void @s_shuffle_v4p0_v3p0__4_3_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12341,14 +12575,10 @@ define void @s_shuffle_v4p0_v3p0__5_3_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12397,12 +12627,9 @@ define void @s_shuffle_v4p0_v3p0__5_u_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12465,15 +12692,12 @@ define void @s_shuffle_v4p0_v3p0__5_0_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12531,12 +12755,9 @@ define void @s_shuffle_v4p0_v3p0__5_1_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12598,14 +12819,10 @@ define void @s_shuffle_v4p0_v3p0__5_2_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:21]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s20
-; GFX942-NEXT:    s_mov_b32 s9, s21
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s16
-; GFX942-NEXT:    s_mov_b32 s13, s17
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[20:21]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12658,14 +12875,10 @@ define void @s_shuffle_v4p0_v3p0__5_4_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12722,14 +12935,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12782,12 +12991,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_u_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12853,14 +13059,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_0_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:21]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s20
-; GFX942-NEXT:    s_mov_b32 s9, s21
-; GFX942-NEXT:    s_mov_b32 s10, s20
-; GFX942-NEXT:    s_mov_b32 s11, s21
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[20:21]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[20:21]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12926,14 +13128,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_1_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:21]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s20
-; GFX942-NEXT:    s_mov_b32 s9, s21
-; GFX942-NEXT:    s_mov_b32 s10, s20
-; GFX942-NEXT:    s_mov_b32 s11, s21
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[20:21]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[20:21]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12995,12 +13193,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_2_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13057,14 +13252,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_4_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13077,20 +13268,48 @@ define void @s_shuffle_v4p0_v3p0__5_5_4_3() {
 }
 
 define void @s_shuffle_v4p0_v3p0__u_4_4_4() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__u_4_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__u_4_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__u_4_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__u_4_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 poison, i32 4, i32 4, i32 4>
@@ -13144,12 +13363,9 @@ define void @s_shuffle_v4p0_v3p0__0_4_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13211,12 +13427,9 @@ define void @s_shuffle_v4p0_v3p0__1_4_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13278,12 +13491,9 @@ define void @s_shuffle_v4p0_v3p0__2_4_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13296,20 +13506,48 @@ define void @s_shuffle_v4p0_v3p0__2_4_4_4() {
 }
 
 define void @s_shuffle_v4p0_v3p0__3_4_4_4() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__3_4_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__3_4_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__3_4_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__3_4_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 3, i32 4, i32 4, i32 4>
@@ -13318,22 +13556,53 @@ define void @s_shuffle_v4p0_v3p0__3_4_4_4() {
 }
 
 define void @s_shuffle_v4p0_v3p0__4_4_4_4() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__4_4_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__4_4_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__4_4_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__4_4_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
@@ -13342,22 +13611,53 @@ define void @s_shuffle_v4p0_v3p0__4_4_4_4() {
 }
 
 define void @s_shuffle_v4p0_v3p0__5_4_4_4() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_4_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_4_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_4_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_4_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 5, i32 4, i32 4, i32 4>
@@ -13402,12 +13702,9 @@ define void @s_shuffle_v4p0_v3p0__5_u_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13466,15 +13763,12 @@ define void @s_shuffle_v4p0_v3p0__5_0_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13532,12 +13826,9 @@ define void @s_shuffle_v4p0_v3p0__5_1_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13599,12 +13890,9 @@ define void @s_shuffle_v4p0_v3p0__5_2_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13657,14 +13945,10 @@ define void @s_shuffle_v4p0_v3p0__5_3_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13717,14 +14001,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13773,12 +14053,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_u_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13840,12 +14117,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_0_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13907,12 +14181,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_1_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13974,12 +14245,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_2_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14028,14 +14296,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_3_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14048,20 +14312,48 @@ define void @s_shuffle_v4p0_v3p0__5_5_3_4() {
 }
 
 define void @s_shuffle_v4p0_v3p0__u_5_5_5() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__u_5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__u_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__u_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__u_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 poison, i32 5, i32 5, i32 5>
@@ -14119,12 +14411,9 @@ define void @s_shuffle_v4p0_v3p0__0_5_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14186,12 +14475,9 @@ define void @s_shuffle_v4p0_v3p0__1_5_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14253,12 +14539,9 @@ define void @s_shuffle_v4p0_v3p0__2_5_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14271,20 +14554,48 @@ define void @s_shuffle_v4p0_v3p0__2_5_5_5() {
 }
 
 define void @s_shuffle_v4p0_v3p0__3_5_5_5() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__3_5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__3_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__3_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__3_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 3, i32 5, i32 5, i32 5>
@@ -14293,22 +14604,53 @@ define void @s_shuffle_v4p0_v3p0__3_5_5_5() {
 }
 
 define void @s_shuffle_v4p0_v3p0__4_5_5_5() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__4_5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__4_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__4_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__4_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 4, i32 5, i32 5, i32 5>
@@ -14317,20 +14659,48 @@ define void @s_shuffle_v4p0_v3p0__4_5_5_5() {
 }
 
 define void @s_shuffle_v4p0_v3p0__5_u_5_5() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_u_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_u_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_u_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_u_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 5, i32 poison, i32 5, i32 5>
@@ -14388,12 +14758,9 @@ define void @s_shuffle_v4p0_v3p0__5_0_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14455,12 +14822,9 @@ define void @s_shuffle_v4p0_v3p0__5_1_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14522,12 +14886,9 @@ define void @s_shuffle_v4p0_v3p0__5_2_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14584,14 +14945,10 @@ define void @s_shuffle_v4p0_v3p0__5_3_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14604,20 +14961,48 @@ define void @s_shuffle_v4p0_v3p0__5_3_5_5() {
 }
 
 define void @s_shuffle_v4p0_v3p0__5_4_5_5() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_4_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_4_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_4_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_4_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 5, i32 4, i32 5, i32 5>
@@ -14666,12 +15051,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_u_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14737,14 +15119,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_0_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14810,14 +15188,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_1_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14879,12 +15253,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_2_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14937,14 +15308,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_3_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15001,14 +15368,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_4_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll
index ce1c54129f706..8a73e474f4ff8 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll
@@ -87,6 +87,7 @@ define void @v_shuffle_v4p0_v4p0__1_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -126,6 +127,7 @@ define void @v_shuffle_v4p0_v4p0__2_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -165,10 +167,12 @@ define void @v_shuffle_v4p0_v4p0__3_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v4p0__3_u_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -221,6 +225,7 @@ define void @v_shuffle_v4p0_v4p0__5_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -261,6 +266,7 @@ define void @v_shuffle_v4p0_v4p0__6_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -301,10 +307,12 @@ define void @v_shuffle_v4p0_v4p0__7_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_u_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -364,11 +372,11 @@ define void @v_shuffle_v4p0_v4p0__7_0_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[2:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v8
-; GFX942-NEXT:    v_mov_b32_e32 v3, v9
+; GFX942-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b32_e32 v2, v8
+; GFX942-NEXT:    v_mov_b32_e32 v3, v9
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -415,14 +423,15 @@ define void @v_shuffle_v4p0_v4p0__7_1_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_1_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v12, 0
+; GFX942-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v10
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v11
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1]
@@ -471,14 +480,15 @@ define void @v_shuffle_v4p0_v4p0__7_2_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_2_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v14, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v14, 0
+; GFX942-NEXT:    global_store_dwordx4 v14, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[6:13]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v12
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v13
 ; GFX942-NEXT:    global_store_dwordx4 v14, v[2:5], s[0:1]
@@ -534,7 +544,7 @@ define void @v_shuffle_v4p0_v4p0__7_3_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v14
 ; GFX942-NEXT:    v_mov_b32_e32 v5, v15
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
@@ -581,14 +591,15 @@ define void @v_shuffle_v4p0_v4p0__7_4_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_4_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b32_e32 v2, v6
+; GFX942-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -629,10 +640,12 @@ define void @v_shuffle_v4p0_v4p0__7_5_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_5_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -675,10 +688,12 @@ define void @v_shuffle_v4p0_v4p0__7_6_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_6_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
@@ -727,6 +742,7 @@ define void @v_shuffle_v4p0_v4p0__7_7_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -7703,6 +7719,7 @@ define void @v_shuffle_v4p0_v4p0__1_4_4_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -7742,6 +7759,7 @@ define void @v_shuffle_v4p0_v4p0__2_4_4_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -7781,10 +7799,12 @@ define void @v_shuffle_v4p0_v4p0__3_4_4_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v4p0__3_4_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -13289,8 +13309,7 @@ define void @s_shuffle_v4p0_v4p0__1_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13330,8 +13349,7 @@ define void @s_shuffle_v4p0_v4p0__2_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13375,8 +13393,7 @@ define void @s_shuffle_v4p0_v4p0__3_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13434,8 +13451,7 @@ define void @s_shuffle_v4p0_v4p0__5_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13476,8 +13492,7 @@ define void @s_shuffle_v4p0_v4p0__6_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13522,8 +13537,7 @@ define void @s_shuffle_v4p0_v4p0__7_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13582,10 +13596,8 @@ define void @s_shuffle_v4p0_v4p0__7_0_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13639,8 +13651,7 @@ define void @s_shuffle_v4p0_v4p0__7_1_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13698,10 +13709,8 @@ define void @s_shuffle_v4p0_v4p0__7_2_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13755,10 +13764,8 @@ define void @s_shuffle_v4p0_v4p0__7_3_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13807,10 +13814,8 @@ define void @s_shuffle_v4p0_v4p0__7_4_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13823,18 +13828,43 @@ define void @s_shuffle_v4p0_v4p0__7_4_u_u() {
 }
 
 define void @s_shuffle_v4p0_v4p0__7_5_u_u() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_5_u_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_u_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_u_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_5_u_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 5, i32 poison, i32 poison>
@@ -13879,10 +13909,8 @@ define void @s_shuffle_v4p0_v4p0__7_6_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13927,10 +13955,8 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13989,13 +14015,11 @@ define void @s_shuffle_v4p0_v4p0__7_7_0_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14054,13 +14078,11 @@ define void @s_shuffle_v4p0_v4p0__7_7_1_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14114,10 +14136,8 @@ define void @s_shuffle_v4p0_v4p0__7_7_2_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14175,12 +14195,9 @@ define void @s_shuffle_v4p0_v4p0__7_7_3_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14229,12 +14246,9 @@ define void @s_shuffle_v4p0_v4p0__7_7_4_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14283,12 +14297,9 @@ define void @s_shuffle_v4p0_v4p0__7_7_5_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14301,20 +14312,48 @@ define void @s_shuffle_v4p0_v4p0__7_7_5_u() {
 }
 
 define void @s_shuffle_v4p0_v4p0__7_7_6_u() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_6_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_6_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 poison>
@@ -14363,12 +14402,9 @@ define void @s_shuffle_v4p0_v4p0__7_7_7_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14434,14 +14470,10 @@ define void @s_shuffle_v4p0_v4p0__7_7_7_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14507,14 +14539,10 @@ define void @s_shuffle_v4p0_v4p0__7_7_7_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14580,14 +14608,10 @@ define void @s_shuffle_v4p0_v4p0__7_7_7_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14649,12 +14673,9 @@ define void @s_shuffle_v4p0_v4p0__7_7_7_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14711,14 +14732,10 @@ define void @s_shuffle_v4p0_v4p0__7_7_7_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14771,14 +14788,10 @@ define void @s_shuffle_v4p0_v4p0__7_7_7_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14835,14 +14848,10 @@ define void @s_shuffle_v4p0_v4p0__7_7_7_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14855,22 +14864,53 @@ define void @s_shuffle_v4p0_v4p0__7_7_7_6() {
 }
 
 define void @s_shuffle_v4p0_v4p0__7_7_7_7() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
@@ -14919,12 +14959,9 @@ define void @s_shuffle_v4p0_v4p0__u_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14936,22 +14973,53 @@ define void @s_shuffle_v4p0_v4p0__u_0_0_0() {
 }
 
 define void @s_shuffle_v4p0_v4p0__0_0_0_0() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__0_0_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_mov_b32 s13, s9
-; GFX9-NEXT:    s_mov_b32 s14, s8
-; GFX9-NEXT:    s_mov_b32 s15, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__0_0_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    s_mov_b32 s12, s8
+; GFX900-NEXT:    s_mov_b32 s13, s9
+; GFX900-NEXT:    s_mov_b32 s14, s8
+; GFX900-NEXT:    s_mov_b32 s15, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__0_0_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    s_mov_b32 s12, s8
+; GFX90A-NEXT:    s_mov_b32 s13, s9
+; GFX90A-NEXT:    s_mov_b32 s14, s8
+; GFX90A-NEXT:    s_mov_b32 s15, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__0_0_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[8:9]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> zeroinitializer
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
@@ -15003,14 +15071,10 @@ define void @s_shuffle_v4p0_v4p0__1_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15062,14 +15126,10 @@ define void @s_shuffle_v4p0_v4p0__2_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15125,14 +15185,10 @@ define void @s_shuffle_v4p0_v4p0__3_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15184,12 +15240,9 @@ define void @s_shuffle_v4p0_v4p0__4_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15251,17 +15304,13 @@ define void @s_shuffle_v4p0_v4p0__5_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15324,15 +15373,12 @@ define void @s_shuffle_v4p0_v4p0__6_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15395,17 +15441,13 @@ define void @s_shuffle_v4p0_v4p0__7_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15464,15 +15506,12 @@ define void @s_shuffle_v4p0_v4p0__7_u_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15535,17 +15574,13 @@ define void @s_shuffle_v4p0_v4p0__7_1_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15611,14 +15646,10 @@ define void @s_shuffle_v4p0_v4p0__7_2_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15680,14 +15711,10 @@ define void @s_shuffle_v4p0_v4p0__7_3_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15750,17 +15777,13 @@ define void @s_shuffle_v4p0_v4p0__7_4_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15822,12 +15845,9 @@ define void @s_shuffle_v4p0_v4p0__7_5_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15893,14 +15913,10 @@ define void @s_shuffle_v4p0_v4p0__7_6_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15963,15 +15979,12 @@ define void @s_shuffle_v4p0_v4p0__7_7_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16030,13 +16043,11 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s0
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16099,15 +16110,12 @@ define void @s_shuffle_v4p0_v4p0__7_7_1_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16169,14 +16177,10 @@ define void @s_shuffle_v4p0_v4p0__7_7_2_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16238,14 +16242,10 @@ define void @s_shuffle_v4p0_v4p0__7_7_3_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16304,15 +16304,12 @@ define void @s_shuffle_v4p0_v4p0__7_7_4_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s0
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16375,15 +16372,12 @@ define void @s_shuffle_v4p0_v4p0__7_7_5_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s0
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16445,12 +16439,9 @@ define void @s_shuffle_v4p0_v4p0__7_7_6_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16463,149 +16454,12 @@ define void @s_shuffle_v4p0_v4p0__7_7_6_0() {
 }
 
 define void @s_shuffle_v4p0_v4p0__u_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__u_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 poison, i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__0_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__0_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__1_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__1_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__2_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__2_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__3_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__3_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__4_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__4_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 4, i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__5_1_1_1() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__5_1_1_1:
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__u_1_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
 ; GFX900-NEXT:    s_mov_b32 s12, s10
 ; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    s_mov_b32 s14, s10
@@ -16615,17 +16469,12 @@ define void @s_shuffle_v4p0_v4p0__5_1_1_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__5_1_1_1:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__u_1_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
 ; GFX90A-NEXT:    s_mov_b32 s12, s10
 ; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    s_mov_b32 s14, s10
@@ -16635,44 +16484,31 @@ define void @s_shuffle_v4p0_v4p0__5_1_1_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__5_1_1_1:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__u_1_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 5, i32 1, i32 1, i32 1>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 poison, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__6_1_1_1() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__6_1_1_1:
+define void @s_shuffle_v4p0_v4p0__0_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__0_1_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s16
-; GFX900-NEXT:    s_mov_b32 s9, s17
 ; GFX900-NEXT:    s_mov_b32 s12, s10
 ; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    s_mov_b32 s14, s10
@@ -16682,17 +16518,12 @@ define void @s_shuffle_v4p0_v4p0__6_1_1_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__6_1_1_1:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__0_1_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s16
-; GFX90A-NEXT:    s_mov_b32 s9, s17
 ; GFX90A-NEXT:    s_mov_b32 s12, s10
 ; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    s_mov_b32 s14, s10
@@ -16702,44 +16533,33 @@ define void @s_shuffle_v4p0_v4p0__6_1_1_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__6_1_1_1:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__0_1_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 6, i32 1, i32 1, i32 1>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_1_1_1() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_1_1_1:
+define void @s_shuffle_v4p0_v4p0__1_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__1_1_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    s_mov_b32 s12, s10
 ; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    s_mov_b32 s14, s10
@@ -16749,17 +16569,14 @@ define void @s_shuffle_v4p0_v4p0__7_1_1_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_1_1_1:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__1_1_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    s_mov_b32 s12, s10
 ; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    s_mov_b32 s14, s10
@@ -16769,283 +16586,224 @@ define void @s_shuffle_v4p0_v4p0__7_1_1_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_1_1_1:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__1_1_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 1, i32 1, i32 1>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_u_1_1() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_1_1:
+define void @s_shuffle_v4p0_v4p0__2_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__2_1_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
-; GFX900-NEXT:    s_mov_b32 s14, s6
-; GFX900-NEXT:    s_mov_b32 s15, s7
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_1_1:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__2_1_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
-; GFX90A-NEXT:    s_mov_b32 s14, s6
-; GFX90A-NEXT:    s_mov_b32 s15, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_u_1_1:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__2_1_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 poison, i32 1, i32 1>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_0_1_1() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_0_1_1:
+define void @s_shuffle_v4p0_v4p0__3_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__3_1_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s14
 ; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
-; GFX900-NEXT:    s_mov_b32 s14, s6
-; GFX900-NEXT:    s_mov_b32 s15, s7
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_0_1_1:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__3_1_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s14
 ; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
-; GFX90A-NEXT:    s_mov_b32 s14, s6
-; GFX90A-NEXT:    s_mov_b32 s15, s7
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_0_1_1:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__3_1_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 0, i32 1, i32 1>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_2_1_1() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_2_1_1:
+define void @s_shuffle_v4p0_v4p0__4_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__4_1_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s16
-; GFX900-NEXT:    s_mov_b32 s11, s17
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_2_1_1:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__4_1_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s16
-; GFX90A-NEXT:    s_mov_b32 s11, s17
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_2_1_1:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__4_1_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 2, i32 1, i32 1>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 4, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_3_1_1() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_3_1_1:
+define void @s_shuffle_v4p0_v4p0__5_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__5_1_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
-; GFX900-NEXT:    s_mov_b32 s14, s6
-; GFX900-NEXT:    s_mov_b32 s15, s7
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_3_1_1:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__5_1_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
-; GFX90A-NEXT:    s_mov_b32 s14, s6
-; GFX90A-NEXT:    s_mov_b32 s15, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_3_1_1:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__5_1_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -17054,140 +16812,126 @@ define void @s_shuffle_v4p0_v4p0__7_3_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 3, i32 1, i32 1>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 5, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_4_1_1() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_4_1_1:
+define void @s_shuffle_v4p0_v4p0__6_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__6_1_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
-; GFX900-NEXT:    s_mov_b32 s14, s6
-; GFX900-NEXT:    s_mov_b32 s15, s7
+; GFX900-NEXT:    s_mov_b32 s8, s16
+; GFX900-NEXT:    s_mov_b32 s9, s17
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_4_1_1:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__6_1_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
-; GFX90A-NEXT:    s_mov_b32 s14, s6
-; GFX90A-NEXT:    s_mov_b32 s15, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s17
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_4_1_1:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__6_1_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 4, i32 1, i32 1>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 6, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_5_1_1() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_1_1:
+define void @s_shuffle_v4p0_v4p0__7_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_1_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
-; GFX900-NEXT:    s_mov_b32 s14, s6
-; GFX900-NEXT:    s_mov_b32 s15, s7
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_1_1:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_1_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
-; GFX90A-NEXT:    s_mov_b32 s14, s6
-; GFX90A-NEXT:    s_mov_b32 s15, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_5_1_1:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_1_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -17196,25 +16940,22 @@ define void @s_shuffle_v4p0_v4p0__7_5_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 5, i32 1, i32 1>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_6_1_1() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_1_1:
+define void @s_shuffle_v4p0_v4p0__7_u_1_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -17225,8 +16966,6 @@ define void @s_shuffle_v4p0_v4p0__7_6_1_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s14
 ; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s12, s6
 ; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    s_mov_b32 s14, s6
@@ -17236,7 +16975,7 @@ define void @s_shuffle_v4p0_v4p0__7_6_1_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_1_1:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -17247,8 +16986,6 @@ define void @s_shuffle_v4p0_v4p0__7_6_1_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s14
 ; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s12, s6
 ; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    s_mov_b32 s14, s6
@@ -17258,36 +16995,31 @@ define void @s_shuffle_v4p0_v4p0__7_6_1_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_6_1_1:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_u_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 6, i32 1, i32 1>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 poison, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_1_1() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_1_1:
+define void @s_shuffle_v4p0_v4p0__7_0_1_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_0_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -17298,8 +17030,8 @@ define void @s_shuffle_v4p0_v4p0__7_7_1_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s14
 ; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
 ; GFX900-NEXT:    s_mov_b32 s12, s6
 ; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    s_mov_b32 s14, s6
@@ -17309,7 +17041,7 @@ define void @s_shuffle_v4p0_v4p0__7_7_1_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_1_1:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_0_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -17320,8 +17052,8 @@ define void @s_shuffle_v4p0_v4p0__7_7_1_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s14
 ; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
 ; GFX90A-NEXT:    s_mov_b32 s12, s6
 ; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    s_mov_b32 s14, s6
@@ -17331,113 +17063,109 @@ define void @s_shuffle_v4p0_v4p0__7_7_1_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_1_1:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_0_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 1>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 0, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_u_1() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_1:
+define void @s_shuffle_v4p0_v4p0__7_2_1_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_2_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s14, s6
-; GFX900-NEXT:    s_mov_b32 s15, s7
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s16
+; GFX900-NEXT:    s_mov_b32 s11, s17
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_1:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_2_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s14, s6
-; GFX90A-NEXT:    s_mov_b32 s15, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_u_1:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_2_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s2
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 1>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 2, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_0_1() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_0_1:
+define void @s_shuffle_v4p0_v4p0__7_3_1_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_3_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    s_mov_b32 s14, s6
 ; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -17445,21 +17173,19 @@ define void @s_shuffle_v4p0_v4p0__7_7_0_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_0_1:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_3_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    s_mov_b32 s14, s6
 ; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -17467,135 +17193,141 @@ define void @s_shuffle_v4p0_v4p0__7_7_0_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_0_1:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_3_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 1>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 3, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_2_1() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_2_1:
+define void @s_shuffle_v4p0_v4p0__7_4_1_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_4_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s14, s6
+; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_2_1:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_4_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s14, s6
+; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_2_1:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_4_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 1>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 4, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_3_1() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_3_1:
+define void @s_shuffle_v4p0_v4p0__7_5_1_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s18
-; GFX900-NEXT:    s_mov_b32 s13, s19
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s14, s6
+; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_3_1:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s18
-; GFX90A-NEXT:    s_mov_b32 s13, s19
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s14, s6
+; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_3_1:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_5_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -17604,39 +17336,36 @@ define void @s_shuffle_v4p0_v4p0__7_7_3_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 1>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 5, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_4_1() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_4_1:
+define void @s_shuffle_v4p0_v4p0__7_6_1_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s18
-; GFX900-NEXT:    s_mov_b32 s11, s19
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    s_mov_b32 s14, s6
 ; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -17644,19 +17373,21 @@ define void @s_shuffle_v4p0_v4p0__7_7_4_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_4_1:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s18
-; GFX90A-NEXT:    s_mov_b32 s11, s19
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    s_mov_b32 s14, s6
 ; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -17664,48 +17395,46 @@ define void @s_shuffle_v4p0_v4p0__7_7_4_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_4_1:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_6_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s2
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 1>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 6, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_5_1() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_5_1:
+define void @s_shuffle_v4p0_v4p0__7_7_1_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s18
-; GFX900-NEXT:    s_mov_b32 s11, s19
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    s_mov_b32 s14, s6
 ; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -17713,21 +17442,21 @@ define void @s_shuffle_v4p0_v4p0__7_7_5_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_5_1:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s18
-; GFX90A-NEXT:    s_mov_b32 s11, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    s_mov_b32 s14, s6
 ; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -17735,34 +17464,31 @@ define void @s_shuffle_v4p0_v4p0__7_7_5_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_5_1:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s2
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 1>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_6_1() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_1:
+define void @s_shuffle_v4p0_v4p0__7_7_u_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -17782,7 +17508,7 @@ define void @s_shuffle_v4p0_v4p0__7_7_6_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_1:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -17802,269 +17528,134 @@ define void @s_shuffle_v4p0_v4p0__7_7_6_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_6_1:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_u_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__u_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__u_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 poison, i32 2, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__0_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__0_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 0, i32 2, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__1_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__1_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 1, i32 2, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__2_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__2_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__3_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__3_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 3, i32 2, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__4_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__4_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 4, i32 2, i32 2, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__5_2_2_2() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__5_2_2_2:
+define void @s_shuffle_v4p0_v4p0__7_7_0_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_0_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s14, s12
-; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s14, s6
+; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__5_2_2_2:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_0_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s14, s12
-; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s6
+; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__5_2_2_2:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_0_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 5, i32 2, i32 2, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__6_2_2_2() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__6_2_2_2:
+define void @s_shuffle_v4p0_v4p0__7_7_2_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_2_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s14, s12
-; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__6_2_2_2:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_2_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s14, s12
-; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__6_2_2_2:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_2_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -18073,65 +17664,59 @@ define void @s_shuffle_v4p0_v4p0__6_2_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 6, i32 2, i32 2, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_2_2_2() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_2_2_2:
+define void @s_shuffle_v4p0_v4p0__7_7_3_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_3_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s14, s12
-; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    s_mov_b32 s12, s18
+; GFX900-NEXT:    s_mov_b32 s13, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_2_2_2:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_3_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s14, s12
-; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s18
+; GFX90A-NEXT:    s_mov_b32 s13, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_2_2_2:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_3_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -18140,195 +17725,195 @@ define void @s_shuffle_v4p0_v4p0__7_2_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 2, i32 2, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_u_2_2() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_2_2:
+define void @s_shuffle_v4p0_v4p0__7_7_4_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_4_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s14, s12
-; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s10, s18
+; GFX900-NEXT:    s_mov_b32 s11, s19
+; GFX900-NEXT:    s_mov_b32 s14, s6
+; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_2_2:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_4_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s14, s12
-; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s10, s18
+; GFX90A-NEXT:    s_mov_b32 s11, s19
+; GFX90A-NEXT:    s_mov_b32 s14, s6
+; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_u_2_2:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_4_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 poison, i32 2, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_0_2_2() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_0_2_2:
+define void @s_shuffle_v4p0_v4p0__7_7_5_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_5_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s10, s18
+; GFX900-NEXT:    s_mov_b32 s11, s19
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s14, s6
+; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_0_2_2:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_5_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s10, s18
+; GFX90A-NEXT:    s_mov_b32 s11, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s6
+; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_0_2_2:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_5_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 0, i32 2, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_1_2_2() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_1_2_2:
+define void @s_shuffle_v4p0_v4p0__7_7_6_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s14, s12
-; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s14, s6
+; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_1_2_2:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s14, s12
-; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s6
+; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_1_2_2:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_6_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -18337,35 +17922,29 @@ define void @s_shuffle_v4p0_v4p0__7_1_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 1, i32 2, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_3_2_2() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_3_2_2:
+define void @s_shuffle_v4p0_v4p0__u_2_2_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__u_2_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s14, s12
 ; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -18373,19 +17952,14 @@ define void @s_shuffle_v4p0_v4p0__7_3_2_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_3_2_2:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__u_2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
 ; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -18393,46 +17967,33 @@ define void @s_shuffle_v4p0_v4p0__7_3_2_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_3_2_2:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__u_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 3, i32 2, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 poison, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_4_2_2() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_4_2_2:
+define void @s_shuffle_v4p0_v4p0__0_2_2_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__0_2_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s14, s12
 ; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -18440,19 +18001,14 @@ define void @s_shuffle_v4p0_v4p0__7_4_2_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_4_2_2:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__0_2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
 ; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -18460,113 +18016,89 @@ define void @s_shuffle_v4p0_v4p0__7_4_2_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_4_2_2:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__0_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 4, i32 2, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 0, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_5_2_2() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_2_2:
+define void @s_shuffle_v4p0_v4p0__1_2_2_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__1_2_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_2_2:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__1_2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_5_2_2:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__1_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 5, i32 2, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 1, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_6_2_2() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_2_2:
+define void @s_shuffle_v4p0_v4p0__2_2_2_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__2_2_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s10, s20
-; GFX900-NEXT:    s_mov_b32 s11, s21
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s14, s12
 ; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -18574,19 +18106,16 @@ define void @s_shuffle_v4p0_v4p0__7_6_2_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_2_2:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__2_2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s10, s20
-; GFX90A-NEXT:    s_mov_b32 s11, s21
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
 ; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -18594,44 +18123,36 @@ define void @s_shuffle_v4p0_v4p0__7_6_2_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_6_2_2:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__2_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 6, i32 2, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_2_2() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_2_2:
+define void @s_shuffle_v4p0_v4p0__3_2_2_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__3_2_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s14, s12
 ; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -18639,17 +18160,16 @@ define void @s_shuffle_v4p0_v4p0__7_7_2_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_2_2:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__3_2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
 ; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -18657,44 +18177,34 @@ define void @s_shuffle_v4p0_v4p0__7_7_2_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_2_2:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__3_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 3, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_u_2() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_2:
+define void @s_shuffle_v4p0_v4p0__4_2_2_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__4_2_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s14, s12
 ; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -18702,17 +18212,14 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_2:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__4_2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
 ; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -18720,70 +18227,66 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_u_2:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__4_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 4, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_0_2() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_0_2:
+define void @s_shuffle_v4p0_v4p0__5_2_2_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__5_2_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_0_2:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__5_2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_0_2:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__5_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -18792,67 +18295,58 @@ define void @s_shuffle_v4p0_v4p0__7_7_0_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 5, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_1_2() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_1_2:
+define void @s_shuffle_v4p0_v4p0__6_2_2_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__6_2_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_1_2:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__6_2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_1_2:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__6_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -18861,67 +18355,62 @@ define void @s_shuffle_v4p0_v4p0__7_7_1_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 6, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_3_2() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_3_2:
+define void @s_shuffle_v4p0_v4p0__7_2_2_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_2_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s18
-; GFX900-NEXT:    s_mov_b32 s13, s19
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_3_2:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s18
-; GFX90A-NEXT:    s_mov_b32 s13, s19
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_3_2:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -18930,94 +18419,81 @@ define void @s_shuffle_v4p0_v4p0__7_7_3_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_4_2() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_4_2:
+define void @s_shuffle_v4p0_v4p0__7_u_2_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_4_2:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_4_2:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_u_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s18
-; GFX942-NEXT:    s_mov_b32 s11, s19
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 poison, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_5_2() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_5_2:
+define void @s_shuffle_v4p0_v4p0__7_0_2_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_0_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -19028,8 +18504,10 @@ define void @s_shuffle_v4p0_v4p0__7_7_5_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
 ; GFX900-NEXT:    s_mov_b32 s14, s16
 ; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -19037,7 +18515,7 @@ define void @s_shuffle_v4p0_v4p0__7_7_5_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_5_2:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_0_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -19048,8 +18526,10 @@ define void @s_shuffle_v4p0_v4p0__7_7_5_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
 ; GFX90A-NEXT:    s_mov_b32 s14, s16
 ; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -19057,76 +18537,68 @@ define void @s_shuffle_v4p0_v4p0__7_7_5_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_5_2:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_0_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s18
-; GFX942-NEXT:    s_mov_b32 s11, s19
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 0, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_6_2() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_2:
+define void @s_shuffle_v4p0_v4p0__7_1_2_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_1_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[16:23]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_2:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_1_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[16:23]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_6_2:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_1_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -19135,157 +18607,21 @@ define void @s_shuffle_v4p0_v4p0__7_7_6_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__u_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__u_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 poison, i32 3, i32 3, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__0_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__0_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 0, i32 3, i32 3, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__1_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__1_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 1, i32 3, i32 3, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__2_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__2_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__3_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__3_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__4_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__4_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 4, i32 3, i32 3, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 1, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__5_3_3_3() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__5_3_3_3:
+define void @s_shuffle_v4p0_v4p0__7_3_2_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_3_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -19294,18 +18630,18 @@ define void @s_shuffle_v4p0_v4p0__5_3_3_3() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    s_mov_b32 s10, s14
 ; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__5_3_3_3:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_3_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -19314,18 +18650,18 @@ define void @s_shuffle_v4p0_v4p0__5_3_3_3() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    s_mov_b32 s10, s14
 ; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__5_3_3_3:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_3_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -19334,25 +18670,22 @@ define void @s_shuffle_v4p0_v4p0__5_3_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 5, i32 3, i32 3, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 3, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__6_3_3_3() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__6_3_3_3:
+define void @s_shuffle_v4p0_v4p0__7_4_2_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_4_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -19361,16 +18694,18 @@ define void @s_shuffle_v4p0_v4p0__6_3_3_3() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__6_3_3_3:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_4_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -19379,16 +18714,18 @@ define void @s_shuffle_v4p0_v4p0__6_3_3_3() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__6_3_3_3:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_4_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -19397,65 +18734,62 @@ define void @s_shuffle_v4p0_v4p0__6_3_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 6, i32 3, i32 3, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 4, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_3_3_3() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_3_3_3:
+define void @s_shuffle_v4p0_v4p0__7_5_2_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_3_3_3:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_3_3_3:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_5_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -19464,61 +18798,62 @@ define void @s_shuffle_v4p0_v4p0__7_3_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 3, i32 3, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 5, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_u_3_3() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_3_3:
+define void @s_shuffle_v4p0_v4p0__7_6_2_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[16:23]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s10, s20
+; GFX900-NEXT:    s_mov_b32 s11, s21
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_3_3:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[16:23]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s10, s20
+; GFX90A-NEXT:    s_mov_b32 s11, s21
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_u_3_3:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_6_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -19527,67 +18862,58 @@ define void @s_shuffle_v4p0_v4p0__7_u_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 poison, i32 3, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 6, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_0_3_3() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_0_3_3:
+define void @s_shuffle_v4p0_v4p0__7_7_2_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s18
-; GFX900-NEXT:    s_mov_b32 s13, s19
-; GFX900-NEXT:    s_mov_b32 s14, s18
-; GFX900-NEXT:    s_mov_b32 s15, s19
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_0_3_3:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s18
-; GFX90A-NEXT:    s_mov_b32 s13, s19
-; GFX90A-NEXT:    s_mov_b32 s14, s18
-; GFX90A-NEXT:    s_mov_b32 s15, s19
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_0_3_3:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -19596,63 +18922,58 @@ define void @s_shuffle_v4p0_v4p0__7_0_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 0, i32 3, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_1_3_3() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_1_3_3:
+define void @s_shuffle_v4p0_v4p0__7_7_u_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_1_3_3:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_1_3_3:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_u_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -19661,63 +18982,58 @@ define void @s_shuffle_v4p0_v4p0__7_1_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 1, i32 3, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_2_3_3() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_2_3_3:
+define void @s_shuffle_v4p0_v4p0__7_7_0_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_0_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_2_3_3:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_0_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_2_3_3:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_0_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -19726,65 +19042,63 @@ define void @s_shuffle_v4p0_v4p0__7_2_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 2, i32 3, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_4_3_3() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_4_3_3:
+define void @s_shuffle_v4p0_v4p0__7_7_1_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_1_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_4_3_3:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_1_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_4_3_3:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_1_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -19793,65 +19107,63 @@ define void @s_shuffle_v4p0_v4p0__7_4_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 4, i32 3, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_5_3_3() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_3_3:
+define void @s_shuffle_v4p0_v4p0__7_7_3_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_3_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    s_mov_b32 s12, s18
 ; GFX900-NEXT:    s_mov_b32 s13, s19
-; GFX900-NEXT:    s_mov_b32 s14, s18
-; GFX900-NEXT:    s_mov_b32 s15, s19
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_3_3:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_3_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    s_mov_b32 s12, s18
 ; GFX90A-NEXT:    s_mov_b32 s13, s19
-; GFX90A-NEXT:    s_mov_b32 s14, s18
-; GFX90A-NEXT:    s_mov_b32 s15, s19
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_5_3_3:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_3_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -19860,187 +19172,192 @@ define void @s_shuffle_v4p0_v4p0__7_5_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 5, i32 3, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_6_3_3() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_3_3:
+define void @s_shuffle_v4p0_v4p0__7_7_4_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_4_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s10, s20
-; GFX900-NEXT:    s_mov_b32 s11, s21
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_3_3:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_4_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s10, s20
-; GFX90A-NEXT:    s_mov_b32 s11, s21
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_6_3_3:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_4_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[12:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 6, i32 3, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_3_3() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_3_3:
+define void @s_shuffle_v4p0_v4p0__7_7_5_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_5_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_3_3:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_5_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_3_3:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_5_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[12:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_u_3() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_3:
+define void @s_shuffle_v4p0_v4p0__7_7_6_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_3:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_u_3:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_6_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -20049,926 +19366,898 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_0_3() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_0_3:
+define void @s_shuffle_v4p0_v4p0__u_3_3_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__u_3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s14, s18
-; GFX900-NEXT:    s_mov_b32 s15, s19
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_0_3:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__u_3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s14, s18
-; GFX90A-NEXT:    s_mov_b32 s15, s19
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_0_3:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__u_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 poison, i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_1_3() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_1_3:
+define void @s_shuffle_v4p0_v4p0__0_3_3_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__0_3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
-; GFX900-NEXT:    s_mov_b32 s14, s18
-; GFX900-NEXT:    s_mov_b32 s15, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_1_3:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__0_3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
-; GFX90A-NEXT:    s_mov_b32 s14, s18
-; GFX90A-NEXT:    s_mov_b32 s15, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_1_3:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__0_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 0, i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_2_3() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_2_3:
+define void @s_shuffle_v4p0_v4p0__1_3_3_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__1_3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_2_3:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__1_3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_2_3:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__1_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 1, i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_4_3() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_4_3:
+define void @s_shuffle_v4p0_v4p0__2_3_3_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__2_3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_4_3:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__2_3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_4_3:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__2_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_5_3() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_5_3:
+define void @s_shuffle_v4p0_v4p0__3_3_3_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__3_3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_5_3:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__3_3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_5_3:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__3_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_6_3() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_3:
+define void @s_shuffle_v4p0_v4p0__4_3_3_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__4_3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
 ; GFX900-NEXT:    s_mov_b32 s10, s14
 ; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s14, s18
-; GFX900-NEXT:    s_mov_b32 s15, s19
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_3:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__4_3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
 ; GFX90A-NEXT:    s_mov_b32 s10, s14
 ; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s14, s18
-; GFX90A-NEXT:    s_mov_b32 s15, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_6_3:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__4_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__u_4_4_4() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__u_4_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 poison, i32 4, i32 4, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 4, i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__0_4_4_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__0_4_4_4:
+define void @s_shuffle_v4p0_v4p0__5_3_3_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__5_3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__0_4_4_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__5_3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__0_4_4_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__5_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 5, i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__1_4_4_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__1_4_4_4:
+define void @s_shuffle_v4p0_v4p0__6_3_3_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__6_3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__1_4_4_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__6_3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__1_4_4_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__6_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 1, i32 4, i32 4, i32 4>
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 6, i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__2_4_4_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__2_4_4_4:
+define void @s_shuffle_v4p0_v4p0__7_3_3_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__2_4_4_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__2_4_4_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 2, i32 4, i32 4, i32 4>
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__3_4_4_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__3_4_4_4:
+define void @s_shuffle_v4p0_v4p0__7_u_3_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__3_4_4_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__3_4_4_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_u_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 3, i32 4, i32 4, i32 4>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__4_4_4_4() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__4_4_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 poison, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__5_4_4_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__5_4_4_4:
+define void @s_shuffle_v4p0_v4p0__7_0_3_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_0_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s18
+; GFX900-NEXT:    s_mov_b32 s13, s19
+; GFX900-NEXT:    s_mov_b32 s14, s18
+; GFX900-NEXT:    s_mov_b32 s15, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__5_4_4_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_0_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s18
+; GFX90A-NEXT:    s_mov_b32 s13, s19
+; GFX90A-NEXT:    s_mov_b32 s14, s18
+; GFX90A-NEXT:    s_mov_b32 s15, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__5_4_4_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_0_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 5, i32 4, i32 4, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 0, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__6_4_4_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__6_4_4_4:
+define void @s_shuffle_v4p0_v4p0__7_1_3_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_1_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__6_4_4_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_1_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__6_4_4_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_1_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 6, i32 4, i32 4, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 1, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_4_4_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_4_4_4:
+define void @s_shuffle_v4p0_v4p0__7_2_3_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_2_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_4_4_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_2_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_4_4_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_2_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 4, i32 4, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 2, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_u_4_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_4_4:
+define void @s_shuffle_v4p0_v4p0__7_4_3_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_4_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_4_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_4_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_u_4_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_4_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 poison, i32 4, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 4, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_0_4_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_0_4_4:
+define void @s_shuffle_v4p0_v4p0__7_5_3_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s18
+; GFX900-NEXT:    s_mov_b32 s13, s19
+; GFX900-NEXT:    s_mov_b32 s14, s18
+; GFX900-NEXT:    s_mov_b32 s15, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_0_4_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s18
+; GFX90A-NEXT:    s_mov_b32 s13, s19
+; GFX90A-NEXT:    s_mov_b32 s14, s18
+; GFX90A-NEXT:    s_mov_b32 s15, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_0_4_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_5_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 0, i32 4, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 5, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_1_4_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_1_4_4:
+define void @s_shuffle_v4p0_v4p0__7_6_3_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -20979,16 +20268,16 @@ define void @s_shuffle_v4p0_v4p0__7_1_4_4() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s22
 ; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s10, s20
+; GFX900-NEXT:    s_mov_b32 s11, s21
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_1_4_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -20999,16 +20288,16 @@ define void @s_shuffle_v4p0_v4p0__7_1_4_4() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s22
 ; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s20
+; GFX90A-NEXT:    s_mov_b32 s11, s21
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_1_4_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_6_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -21017,25 +20306,22 @@ define void @s_shuffle_v4p0_v4p0__7_1_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 1, i32 4, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 6, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_2_4_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_2_4_4:
+define void @s_shuffle_v4p0_v4p0__7_7_3_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -21046,18 +20332,14 @@ define void @s_shuffle_v4p0_v4p0__7_2_4_4() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_2_4_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -21068,649 +20350,609 @@ define void @s_shuffle_v4p0_v4p0__7_2_4_4() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_2_4_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[16:23]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s22
-; GFX942-NEXT:    s_mov_b32 s9, s23
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s16
-; GFX942-NEXT:    s_mov_b32 s13, s17
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 2, i32 4, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_3_4_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_3_4_4:
+define void @s_shuffle_v4p0_v4p0__7_7_u_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_3_4_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_3_4_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_u_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[16:23]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s22
-; GFX942-NEXT:    s_mov_b32 s9, s23
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s16
-; GFX942-NEXT:    s_mov_b32 s13, s17
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 3, i32 4, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_5_4_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_4_4:
+define void @s_shuffle_v4p0_v4p0__7_7_0_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_0_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s6
-; GFX900-NEXT:    s_mov_b32 s11, s7
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    s_mov_b32 s14, s18
+; GFX900-NEXT:    s_mov_b32 s15, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_4_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_0_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s6
-; GFX90A-NEXT:    s_mov_b32 s11, s7
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s18
+; GFX90A-NEXT:    s_mov_b32 s15, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_5_4_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_0_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 5, i32 4, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_6_4_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_4_4:
+define void @s_shuffle_v4p0_v4p0__7_7_1_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_1_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s10, s20
-; GFX900-NEXT:    s_mov_b32 s11, s21
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s14, s18
+; GFX900-NEXT:    s_mov_b32 s15, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_4_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_1_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s10, s20
-; GFX90A-NEXT:    s_mov_b32 s11, s21
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s18
+; GFX90A-NEXT:    s_mov_b32 s15, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_6_4_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_1_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 6, i32 4, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_4_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_4_4:
+define void @s_shuffle_v4p0_v4p0__7_7_2_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_2_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_4_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_2_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_4_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_2_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_u_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_4:
+define void @s_shuffle_v4p0_v4p0__7_7_4_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_4_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_4_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_u_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_4_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_0_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_0_4:
+define void @s_shuffle_v4p0_v4p0__7_7_5_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_5_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s10, s22
-; GFX900-NEXT:    s_mov_b32 s11, s23
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_0_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_5_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s10, s22
-; GFX90A-NEXT:    s_mov_b32 s11, s23
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_0_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_5_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_1_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_1_4:
+define void @s_shuffle_v4p0_v4p0__7_7_6_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s10, s22
-; GFX900-NEXT:    s_mov_b32 s11, s23
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s14, s18
+; GFX900-NEXT:    s_mov_b32 s15, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_1_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s10, s22
-; GFX90A-NEXT:    s_mov_b32 s11, s23
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s18
+; GFX90A-NEXT:    s_mov_b32 s15, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_1_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_6_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_2_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_2_4:
+define void @s_shuffle_v4p0_v4p0__u_4_4_4() {
+; GFX9-LABEL: s_shuffle_v4p0_v4p0__u_4_4_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use s[8:15]
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 poison, i32 4, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__0_4_4_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__0_4_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_2_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__0_4_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_2_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__0_4_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_3_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_3_4:
+define void @s_shuffle_v4p0_v4p0__1_4_4_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__1_4_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_3_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__1_4_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_3_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__1_4_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[16:23]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s22
-; GFX942-NEXT:    s_mov_b32 s9, s23
-; GFX942-NEXT:    s_mov_b32 s10, s22
-; GFX942-NEXT:    s_mov_b32 s11, s23
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 1, i32 4, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_5_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_5_4:
+define void @s_shuffle_v4p0_v4p0__2_4_4_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__2_4_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__2_4_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__2_4_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 2, i32 4, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__3_4_4_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__3_4_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_5_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__3_4_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -21718,8 +20960,74 @@ define void @s_shuffle_v4p0_v4p0__7_7_5_4() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__3_4_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 3, i32 4, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__4_4_4_4() {
+; GFX9-LABEL: s_shuffle_v4p0_v4p0__4_4_4_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use s[8:15]
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__5_4_4_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__5_4_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__5_4_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
 ; GFX90A-NEXT:    s_mov_b32 s14, s4
 ; GFX90A-NEXT:    s_mov_b32 s15, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -21727,129 +21035,1656 @@ define void @s_shuffle_v4p0_v4p0__7_7_5_4() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_5_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__5_4_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 5, i32 4, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_6_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_4:
+define void @s_shuffle_v4p0_v4p0__6_4_4_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__6_4_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s10, s22
-; GFX900-NEXT:    s_mov_b32 s11, s23
-; GFX900-NEXT:    s_mov_b32 s12, s20
-; GFX900-NEXT:    s_mov_b32 s13, s21
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__6_4_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s10, s22
-; GFX90A-NEXT:    s_mov_b32 s11, s23
-; GFX90A-NEXT:    s_mov_b32 s12, s20
-; GFX90A-NEXT:    s_mov_b32 s13, s21
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_6_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__6_4_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 6, i32 4, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__7_4_4_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_4_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_4_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_4_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 4, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__7_u_4_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_u_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 poison, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__7_0_4_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_0_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_0_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_0_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 0, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__7_1_4_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_1_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_1_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_1_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 1, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__7_2_4_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_2_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_2_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_2_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[16:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[22:23]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 2, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__7_3_4_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_3_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_3_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_3_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[16:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[22:23]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 3, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__7_5_4_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s6
+; GFX900-NEXT:    s_mov_b32 s11, s7
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s6
+; GFX90A-NEXT:    s_mov_b32 s11, s7
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_5_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 5, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__7_6_4_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s10, s20
+; GFX900-NEXT:    s_mov_b32 s11, s21
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s10, s20
+; GFX90A-NEXT:    s_mov_b32 s11, s21
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_6_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 6, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__7_7_4_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__7_7_u_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_u_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__7_7_0_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_0_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s10, s22
+; GFX900-NEXT:    s_mov_b32 s11, s23
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_0_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s10, s22
+; GFX90A-NEXT:    s_mov_b32 s11, s23
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_0_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__7_7_1_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_1_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s10, s22
+; GFX900-NEXT:    s_mov_b32 s11, s23
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_1_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s10, s22
+; GFX90A-NEXT:    s_mov_b32 s11, s23
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_1_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__7_7_2_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_2_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_2_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_2_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__7_7_3_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_3_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_3_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_3_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[16:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[22:23]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[22:23]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__7_7_5_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_5_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_5_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_5_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__7_7_6_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s10, s22
+; GFX900-NEXT:    s_mov_b32 s11, s23
+; GFX900-NEXT:    s_mov_b32 s12, s20
+; GFX900-NEXT:    s_mov_b32 s13, s21
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s10, s22
+; GFX90A-NEXT:    s_mov_b32 s11, s23
+; GFX90A-NEXT:    s_mov_b32 s12, s20
+; GFX90A-NEXT:    s_mov_b32 s13, s21
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_6_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__u_5_5_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__u_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__u_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__u_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 poison, i32 5, i32 5, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__0_5_5_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__0_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__0_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__0_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 0, i32 5, i32 5, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__1_5_5_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__1_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__1_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__1_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 1, i32 5, i32 5, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__2_5_5_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__2_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s16
+; GFX900-NEXT:    s_mov_b32 s9, s17
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__2_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s17
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__2_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__3_5_5_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__3_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__3_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__3_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 3, i32 5, i32 5, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__4_5_5_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__4_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__4_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__4_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 4, i32 5, i32 5, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__5_5_5_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__5_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__5_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__5_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 5, i32 5, i32 5, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__6_5_5_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__6_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__6_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__6_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 6, i32 5, i32 5, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__7_5_5_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 5, i32 5, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__7_u_5_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s14, s6
+; GFX900-NEXT:    s_mov_b32 s15, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s14, s6
+; GFX90A-NEXT:    s_mov_b32 s15, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_u_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 4>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__u_5_5_5() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__u_5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 poison, i32 5, i32 5, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 poison, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__0_5_5_5() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__0_5_5_5:
+define void @s_shuffle_v4p0_v4p0__7_0_5_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_0_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -21857,17 +22692,19 @@ define void @s_shuffle_v4p0_v4p0__0_5_5_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__0_5_5_5:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_0_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -21875,74 +22712,69 @@ define void @s_shuffle_v4p0_v4p0__0_5_5_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__0_5_5_5:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_0_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 0, i32 5, i32 5, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 0, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__1_5_5_5() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__1_5_5_5:
+define void @s_shuffle_v4p0_v4p0__7_1_5_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_1_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s12, s10
-; GFX900-NEXT:    s_mov_b32 s13, s11
-; GFX900-NEXT:    s_mov_b32 s14, s10
-; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__1_5_5_5:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_1_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s12, s10
-; GFX90A-NEXT:    s_mov_b32 s13, s11
-; GFX90A-NEXT:    s_mov_b32 s14, s10
-; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__1_5_5_5:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_1_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -21951,253 +22783,150 @@ define void @s_shuffle_v4p0_v4p0__1_5_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 1, i32 5, i32 5, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 1, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__2_5_5_5() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__2_5_5_5:
+define void @s_shuffle_v4p0_v4p0__7_2_5_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_2_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s16
-; GFX900-NEXT:    s_mov_b32 s9, s17
-; GFX900-NEXT:    s_mov_b32 s12, s10
-; GFX900-NEXT:    s_mov_b32 s13, s11
-; GFX900-NEXT:    s_mov_b32 s14, s10
-; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s14, s6
+; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__2_5_5_5:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_2_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s16
-; GFX90A-NEXT:    s_mov_b32 s9, s17
-; GFX90A-NEXT:    s_mov_b32 s12, s10
-; GFX90A-NEXT:    s_mov_b32 s13, s11
-; GFX90A-NEXT:    s_mov_b32 s14, s10
-; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s14, s6
+; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__2_5_5_5:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_2_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[12:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 2, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__3_5_5_5() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__3_5_5_5:
+define void @s_shuffle_v4p0_v4p0__7_3_5_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_3_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s12, s10
-; GFX900-NEXT:    s_mov_b32 s13, s11
-; GFX900-NEXT:    s_mov_b32 s14, s10
-; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__3_5_5_5:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_3_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s10
-; GFX90A-NEXT:    s_mov_b32 s13, s11
-; GFX90A-NEXT:    s_mov_b32 s14, s10
-; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__3_5_5_5:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_3_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[12:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 3, i32 5, i32 5, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__4_5_5_5() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__4_5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 4, i32 5, i32 5, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__5_5_5_5() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__5_5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 5, i32 5, i32 5, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__6_5_5_5() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__6_5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 6, i32 5, i32 5, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__7_5_5_5() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 5, i32 5, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 3, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_u_5_5() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_5_5:
+define void @s_shuffle_v4p0_v4p0__7_4_5_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_4_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -22205,6 +22934,8 @@ define void @s_shuffle_v4p0_v4p0__7_u_5_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
 ; GFX900-NEXT:    s_mov_b32 s12, s6
 ; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    s_mov_b32 s14, s6
@@ -22214,7 +22945,7 @@ define void @s_shuffle_v4p0_v4p0__7_u_5_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_5_5:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_4_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -22222,6 +22953,8 @@ define void @s_shuffle_v4p0_v4p0__7_u_5_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
 ; GFX90A-NEXT:    s_mov_b32 s12, s6
 ; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    s_mov_b32 s14, s6
@@ -22231,43 +22964,38 @@ define void @s_shuffle_v4p0_v4p0__7_u_5_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_u_5_5:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_4_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 poison, i32 5, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 4, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_0_5_5() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_0_5_5:
+define void @s_shuffle_v4p0_v4p0__7_6_5_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s10, s16
+; GFX900-NEXT:    s_mov_b32 s11, s17
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -22275,19 +23003,16 @@ define void @s_shuffle_v4p0_v4p0__7_0_5_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_0_5_5:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s11, s17
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -22295,114 +23020,92 @@ define void @s_shuffle_v4p0_v4p0__7_0_5_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_0_5_5:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_6_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 0, i32 5, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 6, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_1_5_5() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_1_5_5:
+define void @s_shuffle_v4p0_v4p0__7_7_5_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s14, s6
+; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_1_5_5:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s14, s6
+; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_1_5_5:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 1, i32 5, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_2_5_5() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_2_5_5:
+define void @s_shuffle_v4p0_v4p0__7_7_u_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    s_mov_b32 s14, s6
 ; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -22410,21 +23113,14 @@ define void @s_shuffle_v4p0_v4p0__7_2_5_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_2_5_5:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    s_mov_b32 s14, s6
 ; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -22432,34 +23128,28 @@ define void @s_shuffle_v4p0_v4p0__7_2_5_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_2_5_5:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_u_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 2, i32 5, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_3_5_5() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_3_5_5:
+define void @s_shuffle_v4p0_v4p0__7_7_0_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_0_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -22470,14 +23160,16 @@ define void @s_shuffle_v4p0_v4p0__7_3_5_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s10, s18
+; GFX900-NEXT:    s_mov_b32 s11, s19
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_3_5_5:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_0_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -22488,174 +23180,176 @@ define void @s_shuffle_v4p0_v4p0__7_3_5_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s18
+; GFX90A-NEXT:    s_mov_b32 s11, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_3_5_5:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_0_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 3, i32 5, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_4_5_5() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_4_5_5:
+define void @s_shuffle_v4p0_v4p0__7_7_1_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_1_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s10, s18
+; GFX900-NEXT:    s_mov_b32 s11, s19
 ; GFX900-NEXT:    s_mov_b32 s12, s6
 ; GFX900-NEXT:    s_mov_b32 s13, s7
-; GFX900-NEXT:    s_mov_b32 s14, s6
-; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_4_5_5:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_1_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s10, s18
+; GFX90A-NEXT:    s_mov_b32 s11, s19
 ; GFX90A-NEXT:    s_mov_b32 s12, s6
 ; GFX90A-NEXT:    s_mov_b32 s13, s7
-; GFX90A-NEXT:    s_mov_b32 s14, s6
-; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_4_5_5:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_1_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 4, i32 5, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_6_5_5() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_5_5:
+define void @s_shuffle_v4p0_v4p0__7_7_2_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_2_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s16
-; GFX900-NEXT:    s_mov_b32 s11, s17
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s14, s6
+; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_5_5:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_2_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s16
-; GFX90A-NEXT:    s_mov_b32 s11, s17
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s6
+; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_6_5_5:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_2_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 6, i32 5, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_5_5() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_5_5:
+define void @s_shuffle_v4p0_v4p0__7_7_3_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_3_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    s_mov_b32 s14, s6
 ; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -22663,16 +23357,19 @@ define void @s_shuffle_v4p0_v4p0__7_7_5_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_5_5:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_3_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    s_mov_b32 s14, s6
 ; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -22680,33 +23377,31 @@ define void @s_shuffle_v4p0_v4p0__7_7_5_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_5_5:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_3_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_u_5() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_5:
+define void @s_shuffle_v4p0_v4p0__7_7_4_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_4_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -22714,6 +23409,8 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
 ; GFX900-NEXT:    s_mov_b32 s14, s6
 ; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -22721,7 +23418,7 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_5:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_4_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -22729,6 +23426,8 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
 ; GFX90A-NEXT:    s_mov_b32 s14, s6
 ; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -22736,201 +23435,175 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_u_5:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_4_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_0_5() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_0_5:
+define void @s_shuffle_v4p0_v4p0__7_7_6_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
 ; GFX900-NEXT:    s_mov_b32 s10, s18
 ; GFX900-NEXT:    s_mov_b32 s11, s19
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_0_5:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
 ; GFX90A-NEXT:    s_mov_b32 s10, s18
 ; GFX90A-NEXT:    s_mov_b32 s11, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_0_5:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_6_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_1_5() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_1_5:
+define void @s_shuffle_v4p0_v4p0__u_6_6_6() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__u_6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s18
-; GFX900-NEXT:    s_mov_b32 s11, s19
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_1_5:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__u_6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s18
-; GFX90A-NEXT:    s_mov_b32 s11, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_1_5:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__u_6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 poison, i32 6, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_2_5() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_2_5:
+define void @s_shuffle_v4p0_v4p0__0_6_6_6() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__0_6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s14, s6
-; GFX900-NEXT:    s_mov_b32 s15, s7
+; GFX900-NEXT:    s_mov_b32 s10, s16
+; GFX900-NEXT:    s_mov_b32 s11, s17
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_2_5:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__0_6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s14, s6
-; GFX90A-NEXT:    s_mov_b32 s15, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_2_5:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__0_6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -22939,311 +23612,271 @@ define void @s_shuffle_v4p0_v4p0__7_7_2_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 0, i32 6, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_3_5() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_3_5:
+define void @s_shuffle_v4p0_v4p0__1_6_6_6() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__1_6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
-; GFX900-NEXT:    s_mov_b32 s14, s6
-; GFX900-NEXT:    s_mov_b32 s15, s7
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_3_5:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__1_6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
-; GFX90A-NEXT:    s_mov_b32 s14, s6
-; GFX90A-NEXT:    s_mov_b32 s15, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_3_5:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__1_6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s18
-; GFX942-NEXT:    s_mov_b32 s11, s19
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 1, i32 6, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_4_5() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_4_5:
+define void @s_shuffle_v4p0_v4p0__2_6_6_6() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__2_6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s6
-; GFX900-NEXT:    s_mov_b32 s15, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s16
+; GFX900-NEXT:    s_mov_b32 s9, s17
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_4_5:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__2_6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s6
-; GFX90A-NEXT:    s_mov_b32 s15, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_4_5:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__2_6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 2, i32 6, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_6_5() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_5:
+define void @s_shuffle_v4p0_v4p0__3_6_6_6() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__3_6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s18
-; GFX900-NEXT:    s_mov_b32 s11, s19
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_5:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__3_6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s18
-; GFX90A-NEXT:    s_mov_b32 s11, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_6_5:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__3_6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__u_6_6_6() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__u_6_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 poison, i32 6, i32 6, i32 6>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 3, i32 6, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__0_6_6_6() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__0_6_6_6:
+define void @s_shuffle_v4p0_v4p0__4_6_6_6() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__4_6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s16
-; GFX900-NEXT:    s_mov_b32 s11, s17
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__0_6_6_6:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__4_6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s16
-; GFX90A-NEXT:    s_mov_b32 s11, s17
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__0_6_6_6:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__4_6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 0, i32 6, i32 6, i32 6>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 4, i32 6, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__1_6_6_6() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__1_6_6_6:
+define void @s_shuffle_v4p0_v4p0__5_6_6_6() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__5_6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s14, s12
@@ -23253,17 +23886,14 @@ define void @s_shuffle_v4p0_v4p0__1_6_6_6() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__1_6_6_6:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__5_6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
@@ -23273,44 +23903,35 @@ define void @s_shuffle_v4p0_v4p0__1_6_6_6() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__1_6_6_6:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__5_6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 1, i32 6, i32 6, i32 6>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 5, i32 6, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__2_6_6_6() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__2_6_6_6:
+define void @s_shuffle_v4p0_v4p0__6_6_6_6() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__6_6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s16
-; GFX900-NEXT:    s_mov_b32 s9, s17
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s14, s12
@@ -23320,17 +23941,14 @@ define void @s_shuffle_v4p0_v4p0__2_6_6_6() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__2_6_6_6:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__6_6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s16
-; GFX90A-NEXT:    s_mov_b32 s9, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
@@ -23340,44 +23958,35 @@ define void @s_shuffle_v4p0_v4p0__2_6_6_6() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__2_6_6_6:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__6_6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 2, i32 6, i32 6, i32 6>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__3_6_6_6() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__3_6_6_6:
+define void @s_shuffle_v4p0_v4p0__7_6_6_6() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s14, s12
@@ -23387,17 +23996,14 @@ define void @s_shuffle_v4p0_v4p0__3_6_6_6() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__3_6_6_6:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
@@ -23407,119 +24013,19 @@ define void @s_shuffle_v4p0_v4p0__3_6_6_6() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__3_6_6_6:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 3, i32 6, i32 6, i32 6>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__4_6_6_6() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__4_6_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 4, i32 6, i32 6, i32 6>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__5_6_6_6() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__5_6_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 5, i32 6, i32 6, i32 6>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__6_6_6_6() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__6_6_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__7_6_6_6() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_6_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 6, i32 6, i32 6>
@@ -23528,20 +24034,48 @@ define void @s_shuffle_v4p0_v4p0__7_6_6_6() {
 }
 
 define void @s_shuffle_v4p0_v4p0__7_u_6_6() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_u_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_6_6:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_6_6:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_u_6_6:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 poison, i32 6, i32 6>
@@ -23599,12 +24133,9 @@ define void @s_shuffle_v4p0_v4p0__7_0_6_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23666,12 +24197,9 @@ define void @s_shuffle_v4p0_v4p0__7_1_6_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23733,12 +24261,9 @@ define void @s_shuffle_v4p0_v4p0__7_2_6_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23800,12 +24325,9 @@ define void @s_shuffle_v4p0_v4p0__7_3_6_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23862,14 +24384,10 @@ define void @s_shuffle_v4p0_v4p0__7_4_6_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23882,20 +24400,48 @@ define void @s_shuffle_v4p0_v4p0__7_4_6_6() {
 }
 
 define void @s_shuffle_v4p0_v4p0__7_5_6_6() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_5_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_6_6:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_6_6:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_5_6_6:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 5, i32 6, i32 6>
@@ -23904,22 +24450,53 @@ define void @s_shuffle_v4p0_v4p0__7_5_6_6() {
 }
 
 define void @s_shuffle_v4p0_v4p0__7_7_6_6() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_6:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_6:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_6_6:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 6>
@@ -23968,12 +24545,9 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24039,14 +24613,10 @@ define void @s_shuffle_v4p0_v4p0__7_7_0_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s18
-; GFX942-NEXT:    s_mov_b32 s11, s19
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24112,14 +24682,10 @@ define void @s_shuffle_v4p0_v4p0__7_7_1_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s18
-; GFX942-NEXT:    s_mov_b32 s11, s19
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24181,12 +24747,9 @@ define void @s_shuffle_v4p0_v4p0__7_7_2_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24252,14 +24815,10 @@ define void @s_shuffle_v4p0_v4p0__7_7_3_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s18
-; GFX942-NEXT:    s_mov_b32 s11, s19
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24312,14 +24871,10 @@ define void @s_shuffle_v4p0_v4p0__7_7_4_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24376,14 +24931,10 @@ define void @s_shuffle_v4p0_v4p0__7_7_5_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24396,20 +24947,48 @@ define void @s_shuffle_v4p0_v4p0__7_7_5_6() {
 }
 
 define void @s_shuffle_v4p0_v4p0__u_7_7_7() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__u_7_7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__u_7_7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__u_7_7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__u_7_7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 poison, i32 7, i32 7, i32 7>
@@ -24467,12 +25046,9 @@ define void @s_shuffle_v4p0_v4p0__0_7_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24534,12 +25110,9 @@ define void @s_shuffle_v4p0_v4p0__1_7_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24601,12 +25174,9 @@ define void @s_shuffle_v4p0_v4p0__2_7_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24659,117 +25229,232 @@ define void @s_shuffle_v4p0_v4p0__3_7_7_7() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__3_7_7_7:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__3_7_7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 3, i32 7, i32 7, i32 7>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__4_7_7_7() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__4_7_7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__4_7_7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__4_7_7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 4, i32 7, i32 7, i32 7>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__5_7_7_7() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__5_7_7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__5_7_7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__5_7_7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 5, i32 7, i32 7, i32 7>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__6_7_7_7() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__6_7_7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__6_7_7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__6_7_7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 6, i32 7, i32 7, i32 7>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__7_u_7_7() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_u_7_7:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 3, i32 7, i32 7, i32 7>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__4_7_7_7() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__4_7_7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 4, i32 7, i32 7, i32 7>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__5_7_7_7() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__5_7_7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 5, i32 7, i32 7, i32 7>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__6_7_7_7() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__6_7_7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 6, i32 7, i32 7, i32 7>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__7_u_7_7() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_u_7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 poison, i32 7, i32 7>
@@ -24827,12 +25512,9 @@ define void @s_shuffle_v4p0_v4p0__7_0_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24894,12 +25576,9 @@ define void @s_shuffle_v4p0_v4p0__7_1_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24961,12 +25640,9 @@ define void @s_shuffle_v4p0_v4p0__7_2_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25028,12 +25704,9 @@ define void @s_shuffle_v4p0_v4p0__7_3_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25090,14 +25763,10 @@ define void @s_shuffle_v4p0_v4p0__7_4_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25110,20 +25779,48 @@ define void @s_shuffle_v4p0_v4p0__7_4_7_7() {
 }
 
 define void @s_shuffle_v4p0_v4p0__7_5_7_7() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_5_7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_5_7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 5, i32 7, i32 7>
@@ -25132,22 +25829,53 @@ define void @s_shuffle_v4p0_v4p0__7_5_7_7() {
 }
 
 define void @s_shuffle_v4p0_v4p0__7_6_7_7() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_6_7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_6_7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 6, i32 7, i32 7>
@@ -25156,20 +25884,48 @@ define void @s_shuffle_v4p0_v4p0__7_6_7_7() {
 }
 
 define void @s_shuffle_v4p0_v4p0__7_7_u_7() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_u_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_u_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 7>
@@ -25227,12 +25983,9 @@ define void @s_shuffle_v4p0_v4p0__7_7_0_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25294,12 +26047,9 @@ define void @s_shuffle_v4p0_v4p0__7_7_1_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25361,12 +26111,9 @@ define void @s_shuffle_v4p0_v4p0__7_7_2_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25428,12 +26175,9 @@ define void @s_shuffle_v4p0_v4p0__7_7_3_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25486,14 +26230,10 @@ define void @s_shuffle_v4p0_v4p0__7_7_4_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25550,14 +26290,10 @@ define void @s_shuffle_v4p0_v4p0__7_7_5_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25570,20 +26306,48 @@ define void @s_shuffle_v4p0_v4p0__7_7_5_7() {
 }
 
 define void @s_shuffle_v4p0_v4p0__7_7_6_7() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_6_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_6_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 7>
diff --git a/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll b/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll
index d8c015b85584a..f3a9b665e2d99 100644
--- a/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll
@@ -6,25 +6,22 @@ define protected amdgpu_kernel void @test(ptr addrspace(1) %in, ptr addrspace(1)
 ; GFX942-LABEL: test:
 ; GFX942:       ; %bb.0: ; %entry
 ; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], 0
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
+; GFX942-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
-; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[2:3]
-; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[0:1]
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v12, s4
-; GFX942-NEXT:    v_mov_b32_e32 v13, s5
-; GFX942-NEXT:    v_mov_b32_e32 v4, s6
-; GFX942-NEXT:    v_mov_b32_e32 v5, s7
-; GFX942-NEXT:    v_mov_b32_e32 v6, s7
-; GFX942-NEXT:    v_mov_b32_e32 v7, s7
+; GFX942-NEXT:    v_mov_b32_e32 v8, s4
+; GFX942-NEXT:    v_mov_b32_e32 v9, s5
+; GFX942-NEXT:    v_mov_b32_e32 v0, s6
+; GFX942-NEXT:    v_mov_b32_e32 v1, s7
+; GFX942-NEXT:    v_mov_b32_e32 v2, s7
+; GFX942-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_smfmac_i32_16x16x64_i8 v[8:11], v[12:13], v[4:7], v13
+; GFX942-NEXT:    v_smfmac_i32_16x16x64_i8 v[4:7], v[8:9], v[0:3], v9
 ; GFX942-NEXT:    s_nop 6
-; GFX942-NEXT:    global_store_dword v0, v11, s[2:3] offset:12
+; GFX942-NEXT:    global_store_dword v10, v7, s[2:3] offset:12
 ; GFX942-NEXT:    s_endpgm
 entry:
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 0
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll
index d500a3e50f9f7..995ebc822f769 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll
@@ -229,9 +229,10 @@ define half @test_vector_reduce_fmax_v3half(<3 x half> %v) {
 ; GFX9-SDAG-LABEL: test_vector_reduce_fmax_v3half:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX9-SDAG-NEXT:    s_movk_i32 s0, 0x7e00
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v1, v1, s0
+; GFX9-SDAG-NEXT:    s_movk_i32 s0, 0xfe00
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 0x5040100
+; GFX9-SDAG-NEXT:    v_perm_b32 v1, s0, v1, v2
+; GFX9-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX9-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX9-SDAG-NEXT:    s_nop 0
 ; GFX9-SDAG-NEXT:    v_pk_max_f16 v0, v0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll
index 56e7e045e40eb..fe06485749651 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll
@@ -229,9 +229,10 @@ define half @test_vector_reduce_fmin_v3half(<3 x half> %v) {
 ; GFX9-SDAG-LABEL: test_vector_reduce_fmin_v3half:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
 ; GFX9-SDAG-NEXT:    s_movk_i32 s0, 0x7e00
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v1, v1, s0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 0x5040100
+; GFX9-SDAG-NEXT:    v_perm_b32 v1, s0, v1, v2
+; GFX9-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX9-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX9-SDAG-NEXT:    s_nop 0
 ; GFX9-SDAG-NEXT:    v_pk_min_f16 v0, v0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index 310f3ad04917b..80faea8f7cc46 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -459,11 +459,10 @@ define amdgpu_kernel void @v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspa
 ; GFX942-NEXT:  ; %bb.1: ; %bb.1
 ; GFX942-NEXT:    global_load_dwordx2 v[2:3], v5, s[10:11]
 ; GFX942-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v4
-; GFX942-NEXT:    s_waitcnt vmcnt(1)
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX942-NEXT:    s_and_b64 s[4:5], vcc, exec
-; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
 ; GFX942-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
 ; GFX942-NEXT:  .LBB9_2: ; %Flow
 ; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]

>From 48b8fee7433e5201af90919503c240f5e0d3bbdb Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <Janek.vanOirschot at amd.com>
Date: Wed, 25 Jun 2025 04:26:41 -0700
Subject: [PATCH 2/6] Fix undef propagation, should avoid storing undef values

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 33 ++++++++++++
 .../AMDGPU/shufflevector.v3i64.v2i64.ll       | 22 +++-----
 .../AMDGPU/shufflevector.v3i64.v3i64.ll       | 24 +++------
 .../AMDGPU/shufflevector.v3i64.v4i64.ll       | 30 +++--------
 .../CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll | 22 +++-----
 .../CodeGen/AMDGPU/shufflevector.v3p0.v3p0.ll | 24 +++------
 .../CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll | 30 +++--------
 .../AMDGPU/shufflevector.v4i64.v2i64.ll       | 28 +++-------
 .../AMDGPU/shufflevector.v4i64.v3i64.ll       | 43 +++++----------
 .../AMDGPU/shufflevector.v4i64.v4i64.ll       | 52 ++++++-------------
 .../CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll | 28 +++-------
 .../CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll | 43 +++++----------
 .../CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll | 52 ++++++-------------
 13 files changed, 153 insertions(+), 278 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 6390bc79427df..88550ee77a9c9 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -14174,6 +14174,18 @@ bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
       EltSize, NumElem, Idx->isDivergent(), getSubtarget());
 }
 
+static unsigned getMappedVectorIndex(unsigned Idx, EVT From, EVT To) {
+  assert(From.isVector() && To.isVector() &&
+         "From and To types have to be vector types");
+  assert(From.getSizeInBits() == To.getSizeInBits() &&
+         "From and To vector types require the same size");
+
+  unsigned FromNumElts = From.getVectorNumElements();
+  unsigned ToNumElts = To.getVectorNumElements();
+
+  return (Idx * ToNumElts) / FromNumElts;
+}
+
 SDValue
 SITargetLowering::performExtractVectorEltCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
@@ -14236,6 +14248,27 @@ SITargetLowering::performExtractVectorEltCombine(SDNode *N,
     }
   }
 
+  // if PeekThoughBitcast(Vec)[MapIdx(CIdx)] == undef &&
+  //    VecEltSize < PeekThroughEltSize, then
+  // EXTRACT_VECTOR_ELT(bitcast(build_vector(..., undef, ...)), CIdx) => undef
+  auto *IndexC = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  SDValue PeekThroughVec = peekThroughBitcasts(Vec);
+  EVT PeekThroughVecVT = PeekThroughVec.getValueType();
+  if (IndexC && PeekThroughVec.getOpcode() == ISD::BUILD_VECTOR &&
+      PeekThroughVecVT.isFixedLengthVector()) {
+    EVT PeekThroughVecEltVT = PeekThroughVecVT.getVectorElementType();
+    // Small elt size vectors to big elt size vectors are the cases covered for
+    // now (e.g., v4i32 bitcast(v2i64)) which may be conservative.
+    if (VecEltSize < PeekThroughVecEltVT.getSizeInBits()) {
+      unsigned IndexVal = IndexC->getZExtValue();
+      unsigned MappedIndexVal =
+          getMappedVectorIndex(IndexVal, VecVT, PeekThroughVecVT);
+      SDValue PeekThroughElt = PeekThroughVec.getOperand(MappedIndexVal);
+      if (PeekThroughElt.isUndef())
+        return DAG.getUNDEF(VecEltVT);
+    }
+  }
+
   // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
   if (shouldExpandVectorDynExt(N)) {
     SDLoc SL(N);
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll
index 1b3cf9c6eba0f..261257533208b 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll
@@ -45,7 +45,6 @@ define void @v_shuffle_v3i64_v2i64__0_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -85,11 +84,10 @@ define void @v_shuffle_v3i64_v2i64__1_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3i64_v2i64__1_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
@@ -142,11 +140,10 @@ define void @v_shuffle_v3i64_v2i64__3_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
@@ -206,7 +203,7 @@ define void @v_shuffle_v3i64_v2i64__3_0_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[2:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
@@ -257,14 +254,14 @@ define void @v_shuffle_v3i64_v2i64__3_1_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_1_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -311,7 +308,7 @@ define void @v_shuffle_v3i64_v2i64__3_2_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
@@ -354,11 +351,10 @@ define void @v_shuffle_v3i64_v2i64__3_3_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_3_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
@@ -1476,7 +1472,6 @@ define void @v_shuffle_v3i64_v2i64__0_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1516,11 +1511,10 @@ define void @v_shuffle_v3i64_v2i64__1_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3i64_v2i64__1_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v3i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v3i64.ll
index 0ff4daf6feb33..3c546bf8a3130 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v3i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v3i64.ll
@@ -87,7 +87,6 @@ define void @v_shuffle_v3i64_v3i64__1_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -127,11 +126,10 @@ define void @v_shuffle_v3i64_v3i64__2_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3i64_v3i64__2_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
@@ -184,7 +182,6 @@ define void @v_shuffle_v3i64_v3i64__4_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -225,11 +222,10 @@ define void @v_shuffle_v3i64_v3i64__5_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
@@ -289,7 +285,7 @@ define void @v_shuffle_v3i64_v3i64__5_0_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[2:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
@@ -340,11 +336,10 @@ define void @v_shuffle_v3i64_v3i64__5_1_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_1_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v10, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -404,7 +399,7 @@ define void @v_shuffle_v3i64_v3i64__5_2_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[6:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v12, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v10
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v11
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[2:5], s[0:1]
@@ -459,7 +454,6 @@ define void @v_shuffle_v3i64_v3i64__5_3_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v5, v1
-; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -500,11 +494,10 @@ define void @v_shuffle_v3i64_v3i64__5_4_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_4_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
@@ -553,7 +546,6 @@ define void @v_shuffle_v3i64_v3i64__5_5_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v5
-; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2910,7 +2902,6 @@ define void @v_shuffle_v3i64_v3i64__1_3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2950,11 +2941,10 @@ define void @v_shuffle_v3i64_v3i64__2_3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3i64_v3i64__2_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v4i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v4i64.ll
index 58f1e346b6503..7815761d29696 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v4i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v4i64.ll
@@ -87,7 +87,6 @@ define void @v_shuffle_v3i64_v4i64__1_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -127,7 +126,6 @@ define void @v_shuffle_v3i64_v4i64__2_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -167,11 +165,10 @@ define void @v_shuffle_v3i64_v4i64__3_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3i64_v4i64__3_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -224,7 +221,6 @@ define void @v_shuffle_v3i64_v4i64__5_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -265,7 +261,6 @@ define void @v_shuffle_v3i64_v4i64__6_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -306,11 +301,10 @@ define void @v_shuffle_v3i64_v4i64__7_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -370,7 +364,7 @@ define void @v_shuffle_v3i64_v4i64__7_0_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[2:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v10, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v8
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v9
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
@@ -421,11 +415,10 @@ define void @v_shuffle_v3i64_v4i64__7_1_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_1_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v12, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -485,7 +478,7 @@ define void @v_shuffle_v3i64_v4i64__7_2_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[6:13]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v14, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v12
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v13
 ; GFX942-NEXT:    global_store_dwordx4 v14, v[2:5], s[0:1]
@@ -541,7 +534,7 @@ define void @v_shuffle_v3i64_v4i64__7_3_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v16, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v14
 ; GFX942-NEXT:    v_mov_b32_e32 v5, v15
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
@@ -596,7 +589,6 @@ define void @v_shuffle_v3i64_v4i64__7_4_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v5, v1
-; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -637,11 +629,10 @@ define void @v_shuffle_v3i64_v4i64__7_5_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_5_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -690,7 +681,6 @@ define void @v_shuffle_v3i64_v4i64__7_6_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v7
-; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -737,7 +727,6 @@ define void @v_shuffle_v3i64_v4i64__7_7_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v5, v7
-; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4730,7 +4719,6 @@ define void @v_shuffle_v3i64_v4i64__1_4_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4770,7 +4758,6 @@ define void @v_shuffle_v3i64_v4i64__2_4_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4810,11 +4797,10 @@ define void @v_shuffle_v3i64_v4i64__3_4_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3i64_v4i64__3_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll
index 9d557a317986d..fdba1e81224d0 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll
@@ -45,7 +45,6 @@ define void @v_shuffle_v3p0_v2p0__0_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -85,11 +84,10 @@ define void @v_shuffle_v3p0_v2p0__1_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3p0_v2p0__1_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
@@ -142,11 +140,10 @@ define void @v_shuffle_v3p0_v2p0__3_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
@@ -206,7 +203,7 @@ define void @v_shuffle_v3p0_v2p0__3_0_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[2:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
@@ -257,14 +254,14 @@ define void @v_shuffle_v3p0_v2p0__3_1_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_1_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -311,7 +308,7 @@ define void @v_shuffle_v3p0_v2p0__3_2_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
@@ -354,11 +351,10 @@ define void @v_shuffle_v3p0_v2p0__3_3_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_3_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
@@ -1476,7 +1472,6 @@ define void @v_shuffle_v3p0_v2p0__0_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1516,11 +1511,10 @@ define void @v_shuffle_v3p0_v2p0__1_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3p0_v2p0__1_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v3p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v3p0.ll
index 84cc0535469f3..422bcb5b4414a 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v3p0.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v3p0.ll
@@ -87,7 +87,6 @@ define void @v_shuffle_v3p0_v3p0__1_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -127,11 +126,10 @@ define void @v_shuffle_v3p0_v3p0__2_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3p0_v3p0__2_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
@@ -184,7 +182,6 @@ define void @v_shuffle_v3p0_v3p0__4_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -225,11 +222,10 @@ define void @v_shuffle_v3p0_v3p0__5_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
@@ -289,7 +285,7 @@ define void @v_shuffle_v3p0_v3p0__5_0_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[2:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
@@ -340,11 +336,10 @@ define void @v_shuffle_v3p0_v3p0__5_1_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_1_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v10, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -404,7 +399,7 @@ define void @v_shuffle_v3p0_v3p0__5_2_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[6:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v12, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v10
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v11
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[2:5], s[0:1]
@@ -459,7 +454,6 @@ define void @v_shuffle_v3p0_v3p0__5_3_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v5, v1
-; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -500,11 +494,10 @@ define void @v_shuffle_v3p0_v3p0__5_4_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_4_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
@@ -553,7 +546,6 @@ define void @v_shuffle_v3p0_v3p0__5_5_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v5
-; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2910,7 +2902,6 @@ define void @v_shuffle_v3p0_v3p0__1_3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2950,11 +2941,10 @@ define void @v_shuffle_v3p0_v3p0__2_3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3p0_v3p0__2_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll
index 3e9a69b3de166..707633944e851 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll
@@ -87,7 +87,6 @@ define void @v_shuffle_v3p0_v4p0__1_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -127,7 +126,6 @@ define void @v_shuffle_v3p0_v4p0__2_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -167,11 +165,10 @@ define void @v_shuffle_v3p0_v4p0__3_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3p0_v4p0__3_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -224,7 +221,6 @@ define void @v_shuffle_v3p0_v4p0__5_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -265,7 +261,6 @@ define void @v_shuffle_v3p0_v4p0__6_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -306,11 +301,10 @@ define void @v_shuffle_v3p0_v4p0__7_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -370,7 +364,7 @@ define void @v_shuffle_v3p0_v4p0__7_0_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[2:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v10, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v8
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v9
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
@@ -421,11 +415,10 @@ define void @v_shuffle_v3p0_v4p0__7_1_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_1_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v12, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -485,7 +478,7 @@ define void @v_shuffle_v3p0_v4p0__7_2_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[6:13]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v14, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v12
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v13
 ; GFX942-NEXT:    global_store_dwordx4 v14, v[2:5], s[0:1]
@@ -541,7 +534,7 @@ define void @v_shuffle_v3p0_v4p0__7_3_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v16, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v14
 ; GFX942-NEXT:    v_mov_b32_e32 v5, v15
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
@@ -596,7 +589,6 @@ define void @v_shuffle_v3p0_v4p0__7_4_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v5, v1
-; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -637,11 +629,10 @@ define void @v_shuffle_v3p0_v4p0__7_5_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_5_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -690,7 +681,6 @@ define void @v_shuffle_v3p0_v4p0__7_6_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v7
-; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -737,7 +727,6 @@ define void @v_shuffle_v3p0_v4p0__7_7_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v5, v7
-; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4730,7 +4719,6 @@ define void @v_shuffle_v3p0_v4p0__1_4_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4770,7 +4758,6 @@ define void @v_shuffle_v3p0_v4p0__2_4_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4810,11 +4797,10 @@ define void @v_shuffle_v3p0_v4p0__3_4_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3p0_v4p0__3_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx2 v8, v[0:1], s[0:1] offset:16
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll
index f2538521727d1..3dc06c075b039 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll
@@ -45,7 +45,6 @@ define void @v_shuffle_v4i64_v2i64__0_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -85,12 +84,10 @@ define void @v_shuffle_v4i64_v2i64__1_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v2i64__1_u_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
@@ -143,12 +140,10 @@ define void @v_shuffle_v4i64_v2i64__3_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_u_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
@@ -208,8 +203,7 @@ define void @v_shuffle_v4i64_v2i64__3_0_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[2:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
@@ -260,15 +254,14 @@ define void @v_shuffle_v4i64_v2i64__3_1_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_1_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -315,7 +308,7 @@ define void @v_shuffle_v4i64_v2i64__3_2_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
@@ -358,12 +351,10 @@ define void @v_shuffle_v4i64_v2i64__3_3_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_3_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
@@ -2371,7 +2362,6 @@ define void @v_shuffle_v4i64_v2i64__0_2_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2411,12 +2401,10 @@ define void @v_shuffle_v4i64_v2i64__1_2_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v2i64__1_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll
index d4f80eab1e28e..1a295a4c6e8ed 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll
@@ -87,7 +87,6 @@ define void @v_shuffle_v4i64_v3i64__1_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -127,12 +126,10 @@ define void @v_shuffle_v4i64_v3i64__2_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v3i64__2_u_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
@@ -185,7 +182,6 @@ define void @v_shuffle_v4i64_v3i64__4_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -226,12 +222,10 @@ define void @v_shuffle_v4i64_v3i64__5_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_u_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
@@ -291,11 +285,11 @@ define void @v_shuffle_v4i64_v3i64__5_0_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[2:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    v_mov_b32_e32 v4, v0
+; GFX942-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -342,15 +336,14 @@ define void @v_shuffle_v4i64_v3i64__5_1_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_1_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v8
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v9
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1]
@@ -399,15 +392,14 @@ define void @v_shuffle_v4i64_v3i64__5_2_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_2_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[6:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v10
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v11
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[2:5], s[0:1]
@@ -454,12 +446,10 @@ define void @v_shuffle_v4i64_v3i64__5_3_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_3_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
@@ -504,12 +494,10 @@ define void @v_shuffle_v4i64_v3i64__5_4_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_4_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
@@ -552,12 +540,10 @@ define void @v_shuffle_v4i64_v3i64__5_5_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_5_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
@@ -4713,7 +4699,6 @@ define void @v_shuffle_v4i64_v3i64__1_3_3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4753,12 +4738,10 @@ define void @v_shuffle_v4i64_v3i64__2_3_3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v3i64__2_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll
index aaaadb95351c0..d026e3b08b171 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll
@@ -87,7 +87,6 @@ define void @v_shuffle_v4i64_v4i64__1_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -127,7 +126,6 @@ define void @v_shuffle_v4i64_v4i64__2_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -167,12 +165,10 @@ define void @v_shuffle_v4i64_v4i64__3_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v4i64__3_u_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -225,7 +221,6 @@ define void @v_shuffle_v4i64_v4i64__5_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -266,7 +261,6 @@ define void @v_shuffle_v4i64_v4i64__6_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -307,12 +301,10 @@ define void @v_shuffle_v4i64_v4i64__7_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_u_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -372,11 +364,11 @@ define void @v_shuffle_v4i64_v4i64__7_0_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[2:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v8
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v9
+; GFX942-NEXT:    v_mov_b32_e32 v4, v0
+; GFX942-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -423,15 +415,14 @@ define void @v_shuffle_v4i64_v4i64__7_1_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_1_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v10
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v11
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1]
@@ -480,15 +471,14 @@ define void @v_shuffle_v4i64_v4i64__7_2_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_2_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v14, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v14, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    v_mov_b32_e32 v14, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[6:13]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v12
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v13
 ; GFX942-NEXT:    global_store_dwordx4 v14, v[2:5], s[0:1]
@@ -544,7 +534,7 @@ define void @v_shuffle_v4i64_v4i64__7_3_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v14
 ; GFX942-NEXT:    v_mov_b32_e32 v5, v15
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
@@ -591,15 +581,14 @@ define void @v_shuffle_v4i64_v4i64__7_4_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_4_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    v_mov_b32_e32 v4, v0
+; GFX942-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -640,12 +629,10 @@ define void @v_shuffle_v4i64_v4i64__7_5_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_5_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -688,12 +675,10 @@ define void @v_shuffle_v4i64_v4i64__7_6_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_6_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
@@ -742,7 +727,6 @@ define void @v_shuffle_v4i64_v4i64__7_7_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v5, v7
-; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -7719,7 +7703,6 @@ define void @v_shuffle_v4i64_v4i64__1_4_4_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -7759,7 +7742,6 @@ define void @v_shuffle_v4i64_v4i64__2_4_4_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -7799,12 +7781,10 @@ define void @v_shuffle_v4i64_v4i64__3_4_4_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4i64_v4i64__3_4_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll
index e34b6e95da51c..14bacc2f74876 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll
@@ -45,7 +45,6 @@ define void @v_shuffle_v4p0_v2p0__0_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -85,12 +84,10 @@ define void @v_shuffle_v4p0_v2p0__1_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v2p0__1_u_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
@@ -143,12 +140,10 @@ define void @v_shuffle_v4p0_v2p0__3_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_u_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
@@ -208,8 +203,7 @@ define void @v_shuffle_v4p0_v2p0__3_0_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[2:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
@@ -260,15 +254,14 @@ define void @v_shuffle_v4p0_v2p0__3_1_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_1_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -315,7 +308,7 @@ define void @v_shuffle_v4p0_v2p0__3_2_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
@@ -358,12 +351,10 @@ define void @v_shuffle_v4p0_v2p0__3_3_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_3_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
@@ -2371,7 +2362,6 @@ define void @v_shuffle_v4p0_v2p0__0_2_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2411,12 +2401,10 @@ define void @v_shuffle_v4p0_v2p0__1_2_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v2p0__1_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll
index 93b0a34460a49..0398418b82f3d 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll
@@ -87,7 +87,6 @@ define void @v_shuffle_v4p0_v3p0__1_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -127,12 +126,10 @@ define void @v_shuffle_v4p0_v3p0__2_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v3p0__2_u_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
@@ -185,7 +182,6 @@ define void @v_shuffle_v4p0_v3p0__4_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -226,12 +222,10 @@ define void @v_shuffle_v4p0_v3p0__5_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_u_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
@@ -291,11 +285,11 @@ define void @v_shuffle_v4p0_v3p0__5_0_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[2:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    v_mov_b32_e32 v4, v0
+; GFX942-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -342,15 +336,14 @@ define void @v_shuffle_v4p0_v3p0__5_1_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_1_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v8
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v9
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1]
@@ -399,15 +392,14 @@ define void @v_shuffle_v4p0_v3p0__5_2_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_2_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[6:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v10
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v11
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[2:5], s[0:1]
@@ -454,12 +446,10 @@ define void @v_shuffle_v4p0_v3p0__5_3_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_3_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v0
@@ -504,12 +494,10 @@ define void @v_shuffle_v4p0_v3p0__5_4_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_4_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
@@ -552,12 +540,10 @@ define void @v_shuffle_v4p0_v3p0__5_5_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_5_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
@@ -4713,7 +4699,6 @@ define void @v_shuffle_v4p0_v3p0__1_3_3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4753,12 +4738,10 @@ define void @v_shuffle_v4p0_v3p0__2_3_3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v3p0__2_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll
index 8a73e474f4ff8..5e61b0b51e280 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll
@@ -87,7 +87,6 @@ define void @v_shuffle_v4p0_v4p0__1_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -127,7 +126,6 @@ define void @v_shuffle_v4p0_v4p0__2_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -167,12 +165,10 @@ define void @v_shuffle_v4p0_v4p0__3_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v4p0__3_u_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -225,7 +221,6 @@ define void @v_shuffle_v4p0_v4p0__5_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -266,7 +261,6 @@ define void @v_shuffle_v4p0_v4p0__6_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -307,12 +301,10 @@ define void @v_shuffle_v4p0_v4p0__7_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_u_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -372,11 +364,11 @@ define void @v_shuffle_v4p0_v4p0__7_0_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[2:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v8
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v9
+; GFX942-NEXT:    v_mov_b32_e32 v4, v0
+; GFX942-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -423,15 +415,14 @@ define void @v_shuffle_v4p0_v4p0__7_1_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_1_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v10
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v11
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1]
@@ -480,15 +471,14 @@ define void @v_shuffle_v4p0_v4p0__7_2_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_2_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v14, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v14, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    v_mov_b32_e32 v14, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[6:13]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v12
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v13
 ; GFX942-NEXT:    global_store_dwordx4 v14, v[2:5], s[0:1]
@@ -544,7 +534,7 @@ define void @v_shuffle_v4p0_v4p0__7_3_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v14
 ; GFX942-NEXT:    v_mov_b32_e32 v5, v15
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
@@ -591,15 +581,14 @@ define void @v_shuffle_v4p0_v4p0__7_4_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_4_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    v_mov_b32_e32 v4, v0
+; GFX942-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -640,12 +629,10 @@ define void @v_shuffle_v4p0_v4p0__7_5_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_5_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -688,12 +675,10 @@ define void @v_shuffle_v4p0_v4p0__7_6_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_6_u_u:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
@@ -742,7 +727,6 @@ define void @v_shuffle_v4p0_v4p0__7_7_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v4, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v5, v7
-; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -7719,7 +7703,6 @@ define void @v_shuffle_v4p0_v4p0__1_4_4_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -7759,7 +7742,6 @@ define void @v_shuffle_v4p0_v4p0__2_4_4_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -7799,12 +7781,10 @@ define void @v_shuffle_v4p0_v4p0__3_4_4_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v4p0_v4p0__3_4_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]

>From 38d8afeeb21ac1189cb7be20b8aef71801208647 Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <Janek.vanOirschot at amd.com>
Date: Wed, 25 Jun 2025 05:32:55 -0700
Subject: [PATCH 3/6] Straightforward feedback comments

---
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp |  2 +-
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |  8 -------
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 21 ++++++++++++-------
 3 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 044c073e7f918..816f42e0ecd42 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -441,7 +441,7 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
   SDLoc DL(N);
   SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
   unsigned NumRegs = EltVT.getSizeInBits() / 32;
-  bool IsGCN = CurDAG->getSubtarget().getTargetTriple().isAMDGCN();
+  bool IsGCN = TM.getTargetTriple().isAMDGCN();
 
   if (NumVectorElts == 1) {
     CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 8b69808db874d..d75c7a178b4a8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5270,14 +5270,6 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::BITCAST: {
     EVT DestVT = N->getValueType(0);
 
-    // Avoid undoing build_vector with 64b elements if subtarget supports 64b
-    // movs (i.e., avoid inf loop through combines).
-    if (Subtarget->isGCN()) {
-      const GCNSubtarget &ST = DAG.getSubtarget<GCNSubtarget>();
-      if (ST.hasMovB64())
-        break;
-    }
-
     // Push casts through vector builds. This helps avoid emitting a large
     // number of copies when materializing floating point vector constants.
     //
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 88550ee77a9c9..4ee42cbae9723 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -961,6 +961,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 
   setTargetDAGCombine({ISD::ADD,
                        ISD::PTRADD,
+                       ISD::BITCAST,
                        ISD::BUILD_VECTOR,
                        ISD::UADDO_CARRY,
                        ISD::SUB,
@@ -15705,7 +15706,6 @@ SITargetLowering::performBuildVectorCombine(SDNode *N,
 
   SelectionDAG &DAG = DCI.DAG;
   SDLoc SL(N);
-  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N);
 
   EVT VT = N->getValueType(0);
   EVT EltVT = VT.getVectorElementType();
@@ -15713,10 +15713,9 @@ SITargetLowering::performBuildVectorCombine(SDNode *N,
   unsigned EltSize = EltVT.getSizeInBits();
 
   // Skip if:
-  //  - Value type isn't multiplication of 64 bit (e.g., v3i32), or
-  //  - BuildVector instruction has non-constants, or
-  //  - Element type has already been combined into i64 elements
-  if ((SizeBits % 64) != 0 || !BVN->isConstant() || EltVT == MVT::i64)
+  //  - Value type isn't multiple of 64 bit (e.g., v3i32), or
+  //  - Element type has already been combined into 64b elements
+  if ((SizeBits % 64) != 0 || EltVT == MVT::i64 || EltVT == MVT::f64)
     return SDValue();
 
   // Construct the 64b values.
@@ -15724,6 +15723,7 @@ SITargetLowering::performBuildVectorCombine(SDNode *N,
   uint64_t ImmVal = 0;
   uint64_t ImmSize = 0;
   for (SDValue Opand : N->ops()) {
+    // Build_vector with constants only.
     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Opand);
     if (!C)
       return SDValue();
@@ -15755,8 +15755,7 @@ SITargetLowering::performBuildVectorCombine(SDNode *N,
     unsigned NewNumElts = SizeBits / 64;
     LLVMContext &Ctx = *DAG.getContext();
     EVT NewVT = EVT::getVectorVT(Ctx, MVT::i64, NewNumElts);
-    SDValue BV = DAG.getBuildVector(
-        NewVT, SL, ArrayRef(VectorConsts.begin(), VectorConsts.end()));
+    SDValue BV = DAG.getBuildVector(NewVT, SL, VectorConsts);
     return DAG.getBitcast(VT, BV);
   }
   return SDValue();
@@ -15905,6 +15904,14 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
     return performInsertVectorEltCombine(N, DCI);
   case ISD::FP_ROUND:
     return performFPRoundCombine(N, DCI);
+  case ISD::BITCAST: {
+    // Avoid undoing build_vector with 64b elements if subtarget supports 64b
+    // movs (i.e., avoid inf loop through combines).
+    const GCNSubtarget *ST = getSubtarget();
+    if (ST->hasMovB64())
+      return SDValue();
+    break;
+  }
   case ISD::LOAD: {
     if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
       return Widened;

>From 10e2de6c1c076b3e828d066169c8ad47cd183fcd Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <Janek.vanOirschot at amd.com>
Date: Thu, 26 Jun 2025 08:54:56 -0700
Subject: [PATCH 4/6] Remove explicitly setting BUILD_VECTOR 64b elt to legal
 as it is the default

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 20 +++++---------------
 1 file changed, 5 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 4ee42cbae9723..fa60a38f12acb 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -357,9 +357,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   // Most operations are naturally 32-bit vector operations. We only support
   // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
   for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
-    if (STI.hasMovB64())
-      setOperationAction(ISD::BUILD_VECTOR, Vec64, Legal);
-    else {
+    if (!STI.hasMovB64()) {
       setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
       AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
     }
@@ -374,9 +372,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   }
 
   for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
-    if (STI.hasMovB64())
-      setOperationAction(ISD::BUILD_VECTOR, Vec64, Legal);
-    else {
+    if (!STI.hasMovB64()) {
       setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
       AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
     }
@@ -391,9 +387,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   }
 
   for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
-    if (STI.hasMovB64())
-      setOperationAction(ISD::BUILD_VECTOR, Vec64, Legal);
-    else {
+    if (!STI.hasMovB64()) {
       setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
       AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
     }
@@ -408,9 +402,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   }
 
   for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
-    if (STI.hasMovB64())
-      setOperationAction(ISD::BUILD_VECTOR, Vec64, Legal);
-    else {
+    if (!STI.hasMovB64()) {
       setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
       AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
     }
@@ -425,9 +417,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   }
 
   for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
-    if (STI.hasMovB64())
-      setOperationAction(ISD::BUILD_VECTOR, Vec64, Legal);
-    else {
+    if (!STI.hasMovB64()) {
       setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
       AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
     }

>From 4be7a0ca3d38432620b099edfb9c15302f373996 Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <Janek.vanOirschot at amd.com>
Date: Tue, 1 Jul 2025 07:05:20 -0700
Subject: [PATCH 5/6] Update mfma test after rebase

---
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll | 85 +++++++++++++------
 1 file changed, 59 insertions(+), 26 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
index a9cffd6e1c943..5d0d0a39d0dff 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
@@ -210,14 +210,26 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bits:
-; GCN: v_accvgpr_write_b32 a[[A_LOW_BITS_0:[0-9]+]], 0{{$}}
-; GCN: v_accvgpr_write_b32 a[[A_HIGH_BITS_0:[0-9]+]], 64
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_HIGH_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_HIGH_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a[[LAST_CONST_REG:[0-9]+]], a[[A_HIGH_BITS_0]]
+; GFX90A: v_accvgpr_write_b32 a[[A_LOW_BITS_0:[0-9]+]], 0{{$}}
+; GFX90A: v_accvgpr_write_b32 a[[A_HIGH_BITS_0:[0-9]+]], 64
+; GFX90A: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
+; GFX90A: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_HIGH_BITS_0]]
+; GFX90A: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
+; GFX90A: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_HIGH_BITS_0]]
+; GFX90A: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
+; GFX90A: v_accvgpr_mov_b32 a[[LAST_CONST_REG:[0-9]+]], a[[A_HIGH_BITS_0]]
+
+; GFX942: s_mov_b32 s[[S_LOW_BITS_0:[0-9]+]], 0{{$}}
+; GFX942: s_mov_b32 s[[S_HIGH_BITS_64:[0-9]+]], 64
+; GFX942: v_mov_b64_e32 v[[[V_LOW_BITS_0:[0-9]+]]:[[V_HIGH_BITS_64:[0-9]+]]], s[[[S_LOW_BITS_0]]:[[S_HIGH_BITS_64]]]
+; GFX942: v_accvgpr_write_b32 a[[A_LOW_BITS_0:[0-9]+]], v[[V_LOW_BITS_0]]
+; GFX942: v_accvgpr_write_b32 a[[A_HIGH_BITS_0:[0-9]+]], v[[V_HIGH_BITS_64]]
+; GFX942: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_HIGH_BITS_0]]
+; GFX942: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_HIGH_BITS_0]]
+; GFX942: v_accvgpr_mov_b32 a[[LAST_CONST_REG:[0-9]+]], a[[A_HIGH_BITS_0]]
+; GFX942: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
+; GFX942: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
+; GFX942: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
 
 ; GFX90A: v_mfma_f64_16x16x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a{{\[}}[[A_LOW_BITS_0]]:[[LAST_CONST_REG]]{{\]$}}
 ; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 blgp:3
@@ -234,14 +246,23 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and_low:
-; GCN: v_accvgpr_write_b32 a[[A_LOW_BITS_0:[0-9]+]], 64{{$}}
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a[[LAST_CONST_REG:[0-9]+]], a[[A_LOW_BITS_0]]
+; GFX90A: v_accvgpr_write_b32 a[[A_LOW_BITS_0:[0-9]+]], 64{{$}}
+; GFX90A: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
+; GFX90A: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
+; GFX90A: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
+; GFX90A: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
+; GFX90A: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
+; GFX90A: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
+; GFX90A: v_accvgpr_mov_b32 a[[LAST_CONST_REG:[0-9]+]], a[[A_LOW_BITS_0]]
+
+; GFX942: v_accvgpr_write_b32 a[[A_LOW_BITS_0:[0-9]+]], 64{{$}}
+; GFX942: v_accvgpr_mov_b32 a[[A_HIGH_BITS_0:[0-9]+]], a[[A_LOW_BITS_0]]
+; GFX942: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_HIGH_BITS_0]]
+; GFX942: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_HIGH_BITS_0]]
+; GFX942: v_accvgpr_mov_b32 a[[LAST_CONST_REG:[0-9]+]], a[[A_HIGH_BITS_0]]
+; GFX942: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
+; GFX942: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
+; GFX942: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
 
 ; GFX90A: v_mfma_f64_16x16x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a{{\[}}[[A_LOW_BITS_0]]:[[LAST_CONST_REG]]{{\]$}}
 ; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 blgp:3
@@ -258,14 +279,23 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_low:
-; GCN: v_accvgpr_write_b32 a[[A_LOW_BITS_0:[0-9]+]], 1.0
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a[[LAST_CONST_REG:[0-9]+]], a[[A_LOW_BITS_0]]
+; GFX90A: v_accvgpr_write_b32 a[[A_LOW_BITS_0:[0-9]+]], 1.0
+; GFX90A: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
+; GFX90A: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
+; GFX90A: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
+; GFX90A: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
+; GFX90A: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
+; GFX90A: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
+; GFX90A: v_accvgpr_mov_b32 a[[LAST_CONST_REG:[0-9]+]], a[[A_LOW_BITS_0]]
+
+; GFX942: v_accvgpr_write_b32 a[[A_LOW_BITS_0:[0-9]+]], 1.0
+; GFX942: v_accvgpr_mov_b32 a[[A_HIGH_BITS_0:[0-9]+]], a[[A_LOW_BITS_0]]
+; GFX942: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_HIGH_BITS_0]]
+; GFX942: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_HIGH_BITS_0]]
+; GFX942: v_accvgpr_mov_b32 a[[LAST_CONST_REG:[0-9]+]], a[[A_HIGH_BITS_0]]
+; GFX942: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
+; GFX942: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
+; GFX942: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
 
 ; GFX90A: v_mfma_f64_16x16x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a{{\[}}[[A_LOW_BITS_0]]:[[LAST_CONST_REG]]{{\]$}}
 ; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 blgp:3
@@ -294,8 +324,11 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_lit:
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x405ec000
+; GFX90A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+; GFX90A-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x405ec000
+; GFX942-DAG: s_mov_b32 s[[S_LOW_BITS:[0-9]+]], 0{{$}}
+; GFX942-DAG: s_mov_b32 s[[S_HIGH_BITS:[0-9]+]], 0x405ec000
+; GFX942-DAG: v_mov_b64_e32 v[{{[0-9]+:[0-9]+}}], s[[[S_LOW_BITS]]:[[S_HIGH_BITS]]]
 ; GFX90A:  v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}]{{$}}
 ; GFX942:  v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}]{{$}}
 ; GCN:     global_store_dwordx4

>From 672759343d0cb2e433435c7d50f0348d3016e18e Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <Janek.vanOirschot at amd.com>
Date: Wed, 2 Jul 2025 07:36:04 -0700
Subject: [PATCH 6/6] Forgotten fp constant support in combine

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 10 ++-
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll  | 90 ++++++++++---------
 .../CodeGen/AMDGPU/masked-load-vectortypes.ll | 25 +++---
 llvm/test/CodeGen/AMDGPU/mfma-loop.ll         | 65 +++++++-------
 llvm/test/CodeGen/AMDGPU/packed-fp32.ll       | 49 +++++++---
 5 files changed, 132 insertions(+), 107 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index fa60a38f12acb..b752e44f08e18 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -15715,10 +15715,12 @@ SITargetLowering::performBuildVectorCombine(SDNode *N,
   for (SDValue Opand : N->ops()) {
     // Build_vector with constants only.
     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Opand);
-    if (!C)
+    ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Opand);
+    if (!C && !FPC)
       return SDValue();
-
-    ImmVal |= C->getZExtValue() << ImmSize;
+    uint64_t Val = C ? C->getZExtValue()
+                     : FPC->getValueAPF().bitcastToAPInt().getZExtValue();
+    ImmVal |= Val << ImmSize;
     ImmSize += EltSize;
     if (ImmSize > 64)
       return SDValue();
@@ -15734,7 +15736,7 @@ SITargetLowering::performBuildVectorCombine(SDNode *N,
   // Avoid emitting build_vector with 1 element and directly emit value.
   if (ImmVals.size() == 1) {
     SDValue Val = DAG.getConstant(ImmVals[0], SL, MVT::i64);
-    return DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Val);
+    return DAG.getBitcast(VT, Val);
   }
 
   // Construct and return build_vector with 64b elements.
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
index 67ae05eb6f0b8..14d500c116b87 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
@@ -4617,55 +4617,57 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #
 ;
 ; GFX942-LABEL: test_mfma_f32_32x32x1f32_imm:
 ; GFX942:       ; %bb.0: ; %bb
-; GFX942-NEXT:    v_mov_b32_e32 v1, 1.0
-; GFX942-NEXT:    v_accvgpr_write_b32 a1, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a0, 1.0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a2, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a3, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a4, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a5, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a6, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a7, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a8, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a9, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a10, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a11, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a12, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a13, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a14, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a15, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a16, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a17, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a18, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a19, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a20, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a21, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a22, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a23, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a24, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a25, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a26, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a27, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a28, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a29, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a30, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a31, a1
-; GFX942-NEXT:    v_mov_b32_e32 v2, 2.0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0x3f800000
+; GFX942-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a1, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a3, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a5, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a7, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a9, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a11, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a13, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a15, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a17, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a19, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a21, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a23, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a25, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a2, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a4, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a6, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a8, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a10, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a12, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a14, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a16, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a18, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a20, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a22, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a24, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 1.0
+; GFX942-NEXT:    v_accvgpr_mov_b32 a27, a3
+; GFX942-NEXT:    v_accvgpr_mov_b32 a29, a3
+; GFX942-NEXT:    v_accvgpr_mov_b32 a31, a3
+; GFX942-NEXT:    v_accvgpr_mov_b32 a26, a2
+; GFX942-NEXT:    v_accvgpr_mov_b32 a28, a2
+; GFX942-NEXT:    v_accvgpr_mov_b32 a30, a2
+; GFX942-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31]
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-NEXT:    s_nop 7
 ; GFX942-NEXT:    s_nop 7
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; GFX942-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; GFX942-NEXT:    global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; GFX942-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX942-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX942-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX942-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT:    global_store_dwordx4 v2, a[28:31], s[0:1] offset:112
+; GFX942-NEXT:    global_store_dwordx4 v2, a[24:27], s[0:1] offset:96
+; GFX942-NEXT:    global_store_dwordx4 v2, a[20:23], s[0:1] offset:80
+; GFX942-NEXT:    global_store_dwordx4 v2, a[16:19], s[0:1] offset:64
+; GFX942-NEXT:    global_store_dwordx4 v2, a[12:15], s[0:1] offset:48
+; GFX942-NEXT:    global_store_dwordx4 v2, a[8:11], s[0:1] offset:32
+; GFX942-NEXT:    global_store_dwordx4 v2, a[4:7], s[0:1] offset:16
+; GFX942-NEXT:    global_store_dwordx4 v2, a[0:3], s[0:1]
 ; GFX942-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> <float 1.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, i32 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll b/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll
index 88032317165bb..0b77ea604e5b8 100644
--- a/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll
+++ b/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll
@@ -54,13 +54,12 @@ define <4 x float> @uniform_masked_load_ptr1_mask_v4f32(ptr addrspace(1) inreg n
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX942-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX942-NEXT:  ; %bb.1: ; %cond.load
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    global_load_dwordx4 v[0:3], v0, s[0:1]
 ; GFX942-NEXT:  .LBB2_2:
 ; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
@@ -106,20 +105,16 @@ define <8 x float> @uniform_masked_load_ptr1_mask_v8f32(ptr addrspace(1) inreg n
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-NEXT:    v_mov_b32_e32 v7, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
 ; GFX942-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX942-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX942-NEXT:  ; %bb.1: ; %cond.load
-; GFX942-NEXT:    global_load_dwordx4 v[4:7], v0, s[0:1] offset:16
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    global_load_dwordx4 v[0:3], v0, s[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    global_load_dwordx4 v[4:7], v8, s[0:1] offset:16
+; GFX942-NEXT:    global_load_dwordx4 v[0:3], v8, s[0:1]
 ; GFX942-NEXT:  .LBB4_2:
 ; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
index 244b68c703809..6edd8c0b16ba9 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
@@ -2498,39 +2498,40 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg)
 ;
 ; GFX942-LABEL: test_mfma_nested_loop_zeroinit:
 ; GFX942:       ; %bb.0: ; %entry
-; GFX942-NEXT:    v_accvgpr_write_b32 a0, 0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a3, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a5, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a7, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a9, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a11, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a13, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a15, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a17, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a19, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a21, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a23, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a25, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a27, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a29, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a31, v1
 ; GFX942-NEXT:    s_mov_b32 s0, 0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a1, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a2, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a3, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a4, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a5, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a6, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a7, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a8, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a9, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a10, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a11, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a12, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a13, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a14, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a15, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a16, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a17, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a18, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a19, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a20, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a21, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a22, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a23, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a24, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a25, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a26, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a27, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a28, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a29, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a30, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a31, a0
+; GFX942-NEXT:    v_accvgpr_write_b32 a1, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a2, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a4, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a6, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a8, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a10, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a12, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a14, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a16, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a18, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a20, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a22, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a24, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a26, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a28, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a30, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX942-NEXT:    v_mov_b32_e32 v1, 1.0
 ; GFX942-NEXT:  .LBB9_1: ; %for.cond.preheader
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index a338bcbd3ba5c..494c473c69e36 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -490,18 +490,43 @@ define amdgpu_kernel void @fadd_v2_v_lit_hi0(ptr addrspace(1) %a) {
 ; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX900-NEXT:    s_endpgm
 ;
-; PACKED-LABEL: fadd_v2_v_lit_hi0:
-; PACKED:       ; %bb.0:
-; PACKED-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; PACKED-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; PACKED-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; PACKED-NEXT:    s_mov_b64 s[2:3], 0x3f800000
-; PACKED-NEXT:    s_waitcnt lgkmcnt(0)
-; PACKED-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
-; PACKED-NEXT:    s_waitcnt vmcnt(0)
-; PACKED-NEXT:    v_pk_add_f32 v[0:1], v[0:1], s[2:3]
-; PACKED-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; PACKED-NEXT:    s_endpgm
+; GFX90A-SDAG-LABEL: fadd_v2_v_lit_hi0:
+; GFX90A-SDAG:       ; %bb.0:
+; GFX90A-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX90A-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX90A-SDAG-NEXT:    s_mov_b64 s[2:3], 0x3f800000
+; GFX90A-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-SDAG-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-SDAG-NEXT:    v_pk_add_f32 v[0:1], v[0:1], s[2:3]
+; GFX90A-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX90A-SDAG-NEXT:    s_endpgm
+;
+; PACKED-GISEL-LABEL: fadd_v2_v_lit_hi0:
+; PACKED-GISEL:       ; %bb.0:
+; PACKED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-GISEL-NEXT:    s_mov_b64 s[2:3], 0x3f800000
+; PACKED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[0:1], v[0:1], s[2:3]
+; PACKED-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-GISEL-NEXT:    s_endpgm
+;
+; GFX942-SDAG-LABEL: fadd_v2_v_lit_hi0:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    v_pk_add_f32 v[0:1], v[0:1], 1.0
+; GFX942-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    s_endpgm
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
   %load = load <2 x float>, ptr addrspace(1) %gep, align 8



More information about the llvm-commits mailing list