[llvm] [AMDGPU] Add IR LiveReg type-based optimization (PR #66838)

Jeffrey Byrnes via llvm-commits llvm-commits at lists.llvm.org
Tue Sep 19 16:14:06 PDT 2023


https://github.com/jrbyrnes created https://github.com/llvm/llvm-project/pull/66838

NOTE: This commit is part of a stack that also spans Phabricator. This PR is meant to cover only the top of the stack ([AMDGPU] Add IR LiveReg type-based optimization).

As suggested in https://github.com/llvm/llvm-project/pull/66134, this adds the IR-level logic to coerce the type of illegal vectors whose live ranges span basic blocks.

The issue is that local ISel will emit CopyToReg / CopyFromReg pairs for live ranges spanning basic blocks. For illegal vector types, the DAGBuilder will legalize by scalarizing the vector, then widening each scalar, and passing each scalar via a separate physical register. See https://godbolt.org/z/Y7MhcjGE8 for a demo of the issue.

This feature identifies cases like these, and inserts bitcasts between the def of the illegal vector and the uses in different blocks. This avoids the scalarization process and allows the bits to be packed into fewer registers -- for example, we now use 2 VGPRs for a v8i8 instead of 8.


From 907891e95df841ca21696b54adebe47d15e977dd Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Mon, 28 Aug 2023 15:44:23 -0700
Subject: [PATCH 1/4] [AMDGPU]: Allow combining into v_dot4

Differential Revision: https://reviews.llvm.org/D155995

Change-Id: Ia88bffe3059eef6b02964d5e2a5fc208a066914d
---
 llvm/include/llvm/CodeGen/ByteProvider.h  |   16 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |  399 ++-
 llvm/test/CodeGen/AMDGPU/idot2.ll         |   23 +-
 llvm/test/CodeGen/AMDGPU/idot4s.ll        | 2514 +++++++++++++++++-
 llvm/test/CodeGen/AMDGPU/idot4u.ll        | 2843 +++++++++++++++++++--
 5 files changed, 5412 insertions(+), 383 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/ByteProvider.h b/llvm/include/llvm/CodeGen/ByteProvider.h
index 3187b4e68c56f3a..99ae8607c0b2071 100644
--- a/llvm/include/llvm/CodeGen/ByteProvider.h
+++ b/llvm/include/llvm/CodeGen/ByteProvider.h
@@ -32,6 +32,11 @@ template <typename ISelOp> class ByteProvider {
   ByteProvider(std::optional<ISelOp> Src, int64_t DestOffset, int64_t SrcOffset)
       : Src(Src), DestOffset(DestOffset), SrcOffset(SrcOffset) {}
 
+  ByteProvider(std::optional<ISelOp> Src, int64_t DestOffset, int64_t SrcOffset,
+               std::optional<bool> IsSigned)
+      : Src(Src), DestOffset(DestOffset), SrcOffset(SrcOffset),
+        IsSigned(IsSigned) {}
+
   // TODO -- use constraint in c++20
   // Does this type correspond with an operation in selection DAG
   template <typename T> class is_op {
@@ -61,6 +66,9 @@ template <typename ISelOp> class ByteProvider {
   // DestOffset
   int64_t SrcOffset = 0;
 
+  // Whether or not Src should be treated as signed
+  std::optional<bool> IsSigned;
+
   ByteProvider() = default;
 
   static ByteProvider getSrc(std::optional<ISelOp> Val, int64_t ByteOffset,
@@ -70,6 +78,14 @@ template <typename ISelOp> class ByteProvider {
     return ByteProvider(Val, ByteOffset, VectorOffset);
   }
 
+  static ByteProvider getSrc(std::optional<ISelOp> Val, int64_t ByteOffset,
+                             int64_t VectorOffset,
+                             std::optional<bool> IsSigned) {
+    static_assert(is_op<ISelOp>().value,
+                  "ByteProviders must contain an operation in selection DAG.");
+    return ByteProvider(Val, ByteOffset, VectorOffset, IsSigned);
+  }
+
   static ByteProvider getConstantZero() {
     return ByteProvider<ISelOp>(std::nullopt, 0, 0);
   }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1c85ec3f9f5212f..a620cbe239d8066 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10652,6 +10652,7 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
 // performed.
 static const std::optional<ByteProvider<SDValue>>
 calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
+                 std::optional<bool> IsSigned = std::nullopt,
                  unsigned Depth = 0) {
   // We may need to recursively traverse a series of SRLs
   if (Depth >= 6)
@@ -10663,12 +10664,16 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
 
   switch (Op->getOpcode()) {
   case ISD::TRUNCATE: {
-    return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
+    return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, IsSigned,
+                            Depth + 1);
   }
 
   case ISD::SIGN_EXTEND:
   case ISD::ZERO_EXTEND:
   case ISD::SIGN_EXTEND_INREG: {
+    IsSigned = IsSigned.value_or(false) ||
+               Op->getOpcode() == ISD::SIGN_EXTEND ||
+               Op->getOpcode() == ISD::SIGN_EXTEND_INREG;
     SDValue NarrowOp = Op->getOperand(0);
     auto NarrowVT = NarrowOp.getValueType();
     if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
@@ -10681,7 +10686,8 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
 
     if (SrcIndex >= NarrowByteWidth)
       return std::nullopt;
-    return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
+    return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, IsSigned,
+                            Depth + 1);
   }
 
   case ISD::SRA:
@@ -10697,11 +10703,20 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
 
     SrcIndex += BitShift / 8;
 
-    return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
+    return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, IsSigned,
+                            Depth + 1);
   }
 
   default: {
-    return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
+    if (auto A = dyn_cast<AtomicSDNode>(Op) || Op->isMemIntrinsic())
+      return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
+
+    if (auto L = dyn_cast<LoadSDNode>(Op))
+      if (L->getExtensionType() != ISD::NON_EXTLOAD)
+        IsSigned =
+            IsSigned.value_or(false) || L->getExtensionType() == ISD::SEXTLOAD;
+
+    return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex, IsSigned);
   }
   }
   llvm_unreachable("fully handled switch");
@@ -10715,7 +10730,8 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
 // performed. \p StartingIndex is the originally requested byte of the Or
 static const std::optional<ByteProvider<SDValue>>
 calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
-                      unsigned StartingIndex = 0) {
+                      unsigned StartingIndex = 0,
+                      std::optional<bool> IsSigned = std::nullopt) {
   // Finding Src tree of RHS of or typically requires at least 1 additional
   // depth
   if (Depth > 6)
@@ -10730,11 +10746,11 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
   switch (Op.getOpcode()) {
   case ISD::OR: {
     auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
-                                     StartingIndex);
+                                     StartingIndex, IsSigned);
     if (!RHS)
       return std::nullopt;
     auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
-                                     StartingIndex);
+                                     StartingIndex, IsSigned);
     if (!LHS)
       return std::nullopt;
     // A well formed Or will have two ByteProviders for each byte, one of which
@@ -10765,7 +10781,7 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
       return ByteProvider<SDValue>::getConstantZero();
     }
 
-    return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
+    return calculateSrcByte(Op->getOperand(0), StartingIndex, Index, IsSigned);
   }
 
   case ISD::SRA:
@@ -10790,7 +10806,7 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
     // the SRL is Index + ByteShift
     return BytesProvided - ByteShift > Index
                ? calculateSrcByte(Op->getOperand(0), StartingIndex,
-                                  Index + ByteShift)
+                                  Index + ByteShift, IsSigned)
                : ByteProvider<SDValue>::getConstantZero();
   }
 
@@ -10811,7 +10827,7 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
     return Index < ByteShift
                ? ByteProvider<SDValue>::getConstantZero()
                : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
-                                       Depth + 1, StartingIndex);
+                                       Depth + 1, StartingIndex, IsSigned);
   }
   case ISD::ANY_EXTEND:
   case ISD::SIGN_EXTEND:
@@ -10831,12 +10847,18 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
       return std::nullopt;
     uint64_t NarrowByteWidth = NarrowBitWidth / 8;
 
+    IsSigned = IsSigned.value_or(false) ||
+               Op->getOpcode() == ISD::SIGN_EXTEND ||
+               Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
+               Op->getOpcode() == ISD::AssertSext;
+
     if (Index >= NarrowByteWidth)
       return Op.getOpcode() == ISD::ZERO_EXTEND
                  ? std::optional<ByteProvider<SDValue>>(
                        ByteProvider<SDValue>::getConstantZero())
                  : std::nullopt;
-    return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
+    return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex,
+                                 IsSigned);
   }
 
   case ISD::TRUNCATE: {
@@ -10844,7 +10866,7 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
 
     if (NarrowByteWidth >= Index) {
       return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
-                                   StartingIndex);
+                                   StartingIndex, IsSigned);
     }
 
     return std::nullopt;
@@ -10852,13 +10874,18 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
 
   case ISD::CopyFromReg: {
     if (BitWidth / 8 > Index)
-      return calculateSrcByte(Op, StartingIndex, Index);
+      return calculateSrcByte(Op, StartingIndex, Index, IsSigned);
 
     return std::nullopt;
   }
 
   case ISD::LOAD: {
     auto L = cast<LoadSDNode>(Op.getNode());
+
+    // Only set IsSigned if the load is extended
+    if (L->getExtensionType() != ISD::NON_EXTLOAD)
+      IsSigned =
+          IsSigned.value_or(false) || L->getExtensionType() == ISD::SEXTLOAD;
     unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
     if (NarrowBitWidth % 8 != 0)
       return std::nullopt;
@@ -10875,7 +10902,7 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
     }
 
     if (NarrowByteWidth > Index) {
-      return calculateSrcByte(Op, StartingIndex, Index);
+      return calculateSrcByte(Op, StartingIndex, Index, IsSigned);
     }
 
     return std::nullopt;
@@ -10883,7 +10910,7 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
 
   case ISD::BSWAP:
     return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
-                                 Depth + 1, StartingIndex);
+                                 Depth + 1, StartingIndex, IsSigned);
 
   case ISD::EXTRACT_VECTOR_ELT: {
     auto IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
@@ -10898,7 +10925,7 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
     }
 
     return calculateSrcByte(ScalarSize == 32 ? Op : Op.getOperand(0),
-                            StartingIndex, Index);
+                            StartingIndex, Index, IsSigned);
   }
 
   case AMDGPUISD::PERM: {
@@ -10914,9 +10941,10 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
     auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
     auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
 
-    return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
-                           : ByteProvider<SDValue>(
-                                 ByteProvider<SDValue>::getConstantZero());
+    return IdxMask != 0x0c
+               ? calculateSrcByte(NextOp, StartingIndex, NextIndex, IsSigned)
+               : ByteProvider<SDValue>(
+                     ByteProvider<SDValue>::getConstantZero());
   }
 
   default: {
@@ -12513,6 +12541,193 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
   return Accum;
 }
 
+// Collect the ultimate src of each of the mul24 node's operands, and confirm
+// each operand is 8 bits.
+static std::optional<ByteProvider<SDValue>>
+handleMulOperand(const SDValue &MulOperand) {
+  auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
+  if (!Byte0 || Byte0->isConstantZero()) {
+    return std::nullopt;
+  }
+  auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
+  if (Byte1 && !Byte1->isConstantZero()) {
+    return std::nullopt;
+  }
+  return Byte0;
+}
+
+static unsigned addPermMasks(unsigned First, unsigned Second) {
+  unsigned FirstCs = First & 0x0c0c0c0c;
+  unsigned SecondCs = Second & 0x0c0c0c0c;
+  unsigned FirstNoCs = First & ~0x0c0c0c0c;
+  unsigned SecondNoCs = Second & ~0x0c0c0c0c;
+
+  assert(FirstCs & 0xFF | SecondCs & 0xFF);
+  assert(FirstCs & 0xFF00 | SecondCs & 0xFF00);
+  assert(FirstCs & 0xFF0000 | SecondCs & 0xFF0000);
+  assert(FirstCs & 0xFF000000 | SecondCs & 0xFF000000);
+
+  return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
+}
+
+static void placeSources(ByteProvider<SDValue> &Src0,
+                         ByteProvider<SDValue> &Src1,
+                         SmallVectorImpl<std::pair<SDValue, unsigned>> &Src0s,
+                         SmallVectorImpl<std::pair<SDValue, unsigned>> &Src1s,
+                         int Step) {
+
+  assert(Src0.Src.has_value() && Src1.Src.has_value());
+  // Src0s and Src1s are empty, just place arbitrarily
+  if (Step == 0) {
+    Src0s.push_back({*Src0.Src, (Src0.SrcOffset << 24) + 0x0c0c0c});
+    Src1s.push_back({*Src1.Src, (Src1.SrcOffset << 24) + 0x0c0c0c});
+    return;
+  }
+
+  for (int BPI = 0; BPI < 2; BPI++) {
+    std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
+    if (BPI == 1) {
+      BPP = {Src1, Src0};
+    }
+    unsigned ZeroMask = 0x0c0c0c0c;
+    unsigned FMask = 0xFF << (8 * (3 - Step));
+
+    unsigned FirstMask =
+        BPP.first.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask);
+    unsigned SecondMask =
+        BPP.second.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask);
+    // Attempt to find Src vector which contains our SDValue, if so, add our
+    // perm mask to the existing one. If we are unable to find a match for the
+    // first SDValue, attempt to find match for the second.
+    int FirstGroup = -1;
+    for (int I = 0; I < 2; I++) {
+      SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs =
+          I == 0 ? Src0s : Src1s;
+      auto MatchesFirst = [&BPP](std::pair<SDValue, unsigned> IterElt) {
+        return IterElt.first == *BPP.first.Src;
+      };
+
+      auto Match = std::find_if(Srcs.begin(), Srcs.end(), MatchesFirst);
+      if (Match != Srcs.end()) {
+        Match->second = addPermMasks(FirstMask, Match->second);
+        FirstGroup = I;
+        break;
+      }
+    }
+    if (FirstGroup != -1) {
+      SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs =
+          FirstGroup == 1 ? Src0s : Src1s;
+      auto MatchesSecond = [&BPP](std::pair<SDValue, unsigned> IterElt) {
+        return IterElt.first == *BPP.second.Src;
+      };
+      auto Match = std::find_if(Srcs.begin(), Srcs.end(), MatchesSecond);
+      if (Match != Srcs.end()) {
+        Match->second = addPermMasks(SecondMask, Match->second);
+      } else
+        Srcs.push_back({*BPP.second.Src, SecondMask});
+      return;
+    }
+  }
+
+  // If we have made it here, then we could not find a match in Src0s or Src1s
+  // for either Src0 or Src1, so just place them arbitrarily.
+
+  unsigned ZeroMask = 0x0c0c0c0c;
+  unsigned FMask = 0xFF << (8 * (3 - Step));
+
+  Src0s.push_back(
+      {*Src0.Src, (Src0.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask))});
+  Src1s.push_back(
+      {*Src1.Src, (Src1.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask))});
+
+  return;
+}
+
+static SDValue
+resolveSources(SelectionDAG &DAG, SDLoc SL,
+               SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs,
+               bool IsSigned, bool IsAny) {
+
+  // If we just have one source, just permute it accordingly.
+  if (Srcs.size() == 1) {
+    auto Elt = Srcs.begin();
+    auto EltVal = DAG.getBitcastedAnyExtOrTrunc(Elt->first, SL, MVT::i32);
+
+    // v_perm will produce the original value
+    if (Elt->second == 0x3020100)
+      return EltVal;
+
+    return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltVal, EltVal,
+                       DAG.getConstant(Elt->second, SL, MVT::i32));
+  }
+
+  auto FirstElt = Srcs.begin();
+  auto SecondElt = std::next(FirstElt);
+
+  SmallVector<SDValue, 2> Perms;
+
+  // If we have multiple sources in the chain, combine them via perms (using
+  // calculated perm mask) and Ors.
+  while (true) {
+    auto FirstMask = FirstElt->second;
+    auto SecondMask = SecondElt->second;
+
+    unsigned FirstCs = FirstMask & 0x0c0c0c0c;
+    unsigned FirstPlusFour = FirstMask | 0x04040404;
+    // 0x0c + 0x04 = 0x10, so anding with 0x0F will produce 0x00 for any
+    // original 0x0C
+    FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
+
+    auto PermMask = addPermMasks(FirstMask, SecondMask);
+    auto FirstVal =
+        DAG.getBitcastedAnyExtOrTrunc(FirstElt->first, SL, MVT::i32);
+    auto SecondVal =
+        DAG.getBitcastedAnyExtOrTrunc(SecondElt->first, SL, MVT::i32);
+
+    Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
+                                SecondVal,
+                                DAG.getConstant(PermMask, SL, MVT::i32)));
+
+    FirstElt = std::next(SecondElt);
+    if (FirstElt == Srcs.end())
+      break;
+
+    SecondElt = std::next(FirstElt);
+    // If we only have a FirstElt, then just combine that into the cumulative
+    // source node
+    if (SecondElt == Srcs.end()) {
+      auto EltVal =
+          DAG.getBitcastedAnyExtOrTrunc(FirstElt->first, SL, MVT::i32);
+
+      Perms.push_back(
+          DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltVal, EltVal,
+                      DAG.getConstant(FirstElt->second, SL, MVT::i32)));
+      break;
+    }
+  }
+
+  assert(Perms.size() == 1 || Perms.size() == 2);
+  return Perms.size() == 2
+             ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
+             : Perms[0];
+}
+
+static void fixMasks(SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs,
+                     unsigned ChainLength) {
+  for (auto &[EntryVal, EntryMask] : Srcs) {
+    EntryMask = EntryMask >> ((4 - ChainLength) * 8);
+    auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
+    EntryMask += ZeroMask;
+  }
+}
+
+static bool isMul(const SDValue Op) {
+  auto Opcode = Op.getOpcode();
+
+  return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
+          Opcode == AMDGPUISD::MUL_I24);
+}
+
 SDValue SITargetLowering::performAddCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -12526,14 +12741,156 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
       if (SDValue Folded = tryFoldToMad64_32(N, DCI))
         return Folded;
     }
-
-    return SDValue();
   }
 
   if (SDValue V = reassociateScalarOps(N, DAG)) {
     return V;
   }
 
+  if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
+      (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
+    SDValue TempNode(N, 0);
+    auto MulIdx = isMul(LHS) ? 0 : 1;
+
+    auto MulOpcode = TempNode.getOperand(MulIdx).getOpcode();
+    std::optional<bool> IsSigned;
+    SmallVector<std::pair<SDValue, unsigned>, 4> Src0s;
+    SmallVector<std::pair<SDValue, unsigned>, 4> Src1s;
+    SmallVector<SDValue, 4> Src2s;
+
+    // Match the v_dot4 tree, while collecting src nodes.
+    int ChainLength = 0;
+    for (int I = 0; I < 4; I++) {
+      auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
+      if (MulIdx == -1)
+        break;
+      auto IterIsSigned =
+          MulOpcode == AMDGPUISD::MUL_I24 ||
+          (MulOpcode == ISD::MUL &&
+           TempNode->getOperand(MulIdx)->getFlags().hasNoSignedWrap() &&
+           !TempNode->getOperand(MulIdx)->getFlags().hasNoUnsignedWrap());
+      auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
+      if (!Src0)
+        break;
+      auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
+      if (!Src1)
+        break;
+      if ((Src0->IsSigned != Src1->IsSigned) ||
+          (Src0->IsSigned && Src1->IsSigned &&
+           (*Src0->IsSigned != *Src1->IsSigned)))
+        break;
+      IterIsSigned |= Src0->IsSigned.value_or(false);
+      if (!IsSigned)
+        IsSigned = IterIsSigned;
+      if (IterIsSigned != *IsSigned)
+        break;
+      placeSources(*Src0, *Src1, Src0s, Src1s, I);
+      auto AddIdx = 1 - MulIdx;
+      // Allow the special case where add (add (mul24, 0), mul24) became ->
+      // add (mul24, mul24)
+      if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
+        Src2s.push_back(TempNode->getOperand(AddIdx));
+        auto Src0 =
+            handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
+        if (!Src0)
+          break;
+        auto Src1 =
+            handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
+        if (!Src1)
+          break;
+        if ((Src0->IsSigned != Src1->IsSigned) ||
+            (Src0->IsSigned && Src1->IsSigned &&
+             (*Src0->IsSigned != *Src1->IsSigned)))
+          break;
+        auto IterIsSigned =
+            MulOpcode == AMDGPUISD::MUL_I24 ||
+            (MulOpcode == ISD::MUL &&
+             TempNode->getOperand(MulIdx)->getFlags().hasNoSignedWrap() &&
+             !TempNode->getOperand(MulIdx)->getFlags().hasNoUnsignedWrap());
+        IterIsSigned |= Src0->IsSigned.value_or(false);
+        assert(IsSigned);
+        if (IterIsSigned != *IsSigned)
+          break;
+        placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
+        Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
+        ChainLength = I + 2;
+        break;
+      }
+
+      TempNode = TempNode->getOperand(AddIdx);
+      Src2s.push_back(TempNode);
+      ChainLength = I + 1;
+      if (TempNode->getNumOperands() < 2)
+        break;
+      LHS = TempNode->getOperand(0);
+      RHS = TempNode->getOperand(1);
+    }
+
+    if (ChainLength < 2)
+      return SDValue();
+
+    // Masks were constructed with assumption that we would find a chain of
+    // length 4. If not, then we need to 0 out the MSB bits (via perm mask of
+    // 0x0c) so they do not affect dot calculation.
+    if (ChainLength < 4) {
+      fixMasks(Src0s, ChainLength);
+      fixMasks(Src1s, ChainLength);
+    }
+
+    SDValue Src0, Src1;
+
+    // If we are just using a single source for both, and have permuted the
+    // bytes consistently, we can just use the sources without permuting
+    // (commutation)
+    bool UseOriginalSrc = false;
+    if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
+        Src0s.begin()->second == Src1s.begin()->second &&
+        Src0s.begin()->first.getValueSizeInBits() == 32 &&
+        Src1s.begin()->first.getValueSizeInBits() == 32) {
+      SmallVector<unsigned, 4> SrcBytes;
+      auto Src0Mask = Src0s.begin()->second;
+      SrcBytes.push_back(Src0Mask & 0xFF000000);
+      bool UniqueEntries = true;
+      for (auto I = 1; I < 4; I++) {
+        auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
+
+        if (is_contained(SrcBytes, NextByte)) {
+          UniqueEntries = false;
+          break;
+        }
+        SrcBytes.push_back(NextByte);
+      }
+
+      if (UniqueEntries) {
+        UseOriginalSrc = true;
+        // Must be 32 bits to enter above conditional
+        assert(Src0s.begin()->first.getValueSizeInBits() == 32);
+        assert(Src1s.begin()->first.getValueSizeInBits() == 32);
+        Src0 = DAG.getBitcast(MVT::getIntegerVT(32), Src0s.begin()->first);
+        Src1 = DAG.getBitcast(MVT::getIntegerVT(32), Src1s.begin()->first);
+      }
+    }
+
+    if (!UseOriginalSrc) {
+      Src0 = resolveSources(DAG, SL, Src0s, false, true);
+      Src1 = resolveSources(DAG, SL, Src1s, false, true);
+    }
+
+    assert(IsSigned);
+    SDValue Src2 =
+        DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
+
+    SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
+                                                  : Intrinsic::amdgcn_udot4,
+                                        SL, MVT::i64);
+
+    assert(!VT.isVector());
+    auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
+                           Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
+
+    return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
+  }
+
   if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
     return SDValue();
 
diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll
index ccde5efce08dc8e..56f72ac9d9e8c6d 100644
--- a/llvm/test/CodeGen/AMDGPU/idot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot2.ll
@@ -2823,18 +2823,18 @@ define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc0c0001
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
 ; GFX9-DL-NEXT:    global_load_ushort v2, v0, s[6:7]
 ; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s1
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v1, 8, v1
-; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v2, 8, v2
-; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s1
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_add3_u32 v1, v1, s0, v3
+; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v2, v1, s0
 ; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -2843,21 +2843,20 @@ define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    global_load_ushort v2, v0, s[6:7]
 ; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT:    v_lshrrev_b16 v0, 8, v1
+; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v1, 0xc0c0001
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_lshrrev_b16 v3, 8, v2
-; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v0, sext(v3), sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX10-DL-NEXT:    v_perm_b32 v1, v2, v2, 0xc0c0001
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_add3_u32 v0, v0, s2, v1
-; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-DL-NEXT:    v_dot4c_i32_i8_e32 v2, v1, v0
+; GFX10-DL-NEXT:    global_store_dword v3, v2, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
                                           ptr addrspace(1) %src2,
                                           ptr addrspace(1) nocapture %dst) {
diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index ea22aaee761c8dd..5c44ba008df04e1 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -5,6 +5,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DL %s
 
 define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1,
 ; GFX7-LABEL: idot4_acc32:
@@ -117,16 +118,36 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT:    v_dot4_i32_i8 v1, v1, v2, s2
-; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_dot4c_i32_i8_e32 v0, v1, v2
+; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v1, v0, s2 neg_lo:[1,1,0]
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
 entry:
@@ -169,8 +190,6 @@ entry:
   ret void
 }
 
-; TODO: Currently, vector elements{0 and 3} get zero_extended from i16 to i32 which should
-; be sign_extended directly to i32; prevents the pattern recognizer to recognize this pattern.
 define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1,
 ; GFX7-LABEL: idot4_acc16:
 ; GFX7:       ; %bb.0: ; %entry
@@ -294,33 +313,14 @@ define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
-; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-DL-NEXT:    global_load_ushort v3, v0, s[2:3]
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT:    v_bfe_i32 v6, v1, 0, 8
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_bfe_i32 v7, v2, 0, 8
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX9-DL-NEXT:    v_bfe_i32 v8, v8, 0, 8
-; GFX9-DL-NEXT:    v_bfe_i32 v9, v9, 0, 8
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
+; GFX9-DL-NEXT:    global_load_sshort v4, v1, s[2:3]
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v6, v7, v3
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX9-DL-NEXT:    v_bfe_i32 v4, v4, 0, 8
-; GFX9-DL-NEXT:    v_bfe_i32 v5, v5, 0, 8
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v8, v9, v3
-; GFX9-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX9-DL-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v4, v5, v3
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
-; GFX9-DL-NEXT:    global_store_short v0, v1, s[2:3]
+; GFX9-DL-NEXT:    v_dot4_i32_i8 v0, v2, v3, v4
+; GFX9-DL-NEXT:    global_store_short v1, v0, s[2:3]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: idot4_acc16:
@@ -329,35 +329,34 @@ define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_clause 0x1
-; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
-; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT:    global_load_ushort v3, v0, s[2:3]
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT:    v_bfe_i32 v4, v1, 0, 8
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
-; GFX10-DL-NEXT:    v_bfe_i32 v7, v2, 0, 8
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
-; GFX10-DL-NEXT:    v_bfe_i32 v5, v5, 0, 8
-; GFX10-DL-NEXT:    v_bfe_i32 v6, v6, 0, 8
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
+; GFX10-DL-NEXT:    global_load_sshort v4, v1, s[2:3]
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v7, v3
-; GFX10-DL-NEXT:    v_bfe_i32 v4, v8, 0, 8
-; GFX10-DL-NEXT:    v_bfe_i32 v7, v9, 0, 8
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX10-DL-NEXT:    v_mad_u16 v3, v5, v6, v3
-; GFX10-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX10-DL-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v7, v3
-; GFX10-DL-NEXT:    v_mad_u16 v1, v1, v2, v3
-; GFX10-DL-NEXT:    global_store_short v0, v1, s[2:3]
+; GFX10-DL-NEXT:    v_dot4c_i32_i8_e32 v4, v2, v3
+; GFX10-DL-NEXT:    global_store_short v1, v4, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc16:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_i16 v3, v1, s[0:1]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v2, v0, v3 neg_lo:[1,1,0]
+; GFX11-DL-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
 entry:
@@ -499,25 +498,14 @@ define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
-; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[2:3]
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
+; GFX9-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v6, v7, v1
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v4, v5, v1
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v8, v9, v1
-; GFX9-DL-NEXT:    global_store_byte v0, v1, s[2:3]
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v0, v2, v3, v4
+; GFX9-DL-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: idot4_acc8:
@@ -532,21 +520,28 @@ define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
 ; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
 ; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u16 v4, v2, v3, v4
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
-; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v4
-; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
-; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v2, v3, v4
 ; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc8:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_u8 v3, v1, s[0:1]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v2, v0, v3
+; GFX11-DL-NEXT:    global_store_b8 v1, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                       ptr addrspace(1) %src2,
                                       ptr addrspace(1) nocapture %dst) {
 entry:
@@ -692,14 +687,9 @@ define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    v_bfe_i32 v3, v1, 0, 8
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DL-NEXT:    v_bfe_i32 v4, v2, 0, 8
-; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v6, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
-; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v2, v3, v4
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    v_mad_i32_i24 v3, v3, v4, s0
-; GFX9-DL-NEXT:    v_add3_u32 v2, v5, v3, v2
-; GFX9-DL-NEXT:    v_add3_u32 v1, v2, v6, v1
+; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v1, v2, v3
 ; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -717,17 +707,36 @@ define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    v_bfe_i32 v0, v1, 0, 8
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-NEXT:    v_bfe_i32 v3, v2, 0, 8
-; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v4, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX10-DL-NEXT:    v_mul_i32_i24_e32 v5, v0, v3
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mad_i32_i24 v0, v0, v3, s2
-; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
-; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-DL-NEXT:    v_add3_u32 v0, v4, v0, v5
-; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v3, v1
-; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-DL-NEXT:    v_dot4c_i32_i8_e32 v0, v1, v2
+; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_multiuse_mul1:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_bfe_i32 v2, v1, 0, 8
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_bfe_i32 v3, v0, 0, 8
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_mad_i32_i24 v2, v2, v3, s2
+; GFX11-DL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v1, v0, v2 neg_lo:[1,1,0]
+; GFX11-DL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                                ptr addrspace(1) %src2,
                                                ptr addrspace(1) nocapture %dst) {
 entry:
@@ -772,7 +781,6 @@ entry:
   ret void
 }
 
-; TODO: Support this pattern.
 define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1,
 ; GFX7-LABEL: idot4_acc32_vecMul:
 ; GFX7:       ; %bb.0: ; %entry
@@ -879,17 +887,8 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v3, 8, v1
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v4, 8, v2
-; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v3), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v6, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
-; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_add3_u32 v2, v5, s0, v3
-; GFX9-DL-NEXT:    v_add3_u32 v1, v2, v6, v1
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v1, v2, s0
 ; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -898,25 +897,36 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT:    v_lshrrev_b16 v0, 8, v1
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_lshrrev_b16 v3, 8, v2
-; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v4, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v0, sext(v0), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
-; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_add3_u32 v0, v4, s2, v0
-; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v3, v1
-; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_dot4c_i32_i8_e32 v0, v1, v2
+; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32_vecMul:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v1, v0, s2 neg_lo:[1,1,0]
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                               ptr addrspace(1) %src2,
                                               ptr addrspace(1) nocapture %dst) {
 entry:
@@ -1137,6 +1147,53 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v3
 ; GFX10-DL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc16_vecMul:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_u16 v3, v2, s[0:1]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-DL-NEXT:    v_ashrrev_i16 v4, 8, v1
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_ashrrev_i16 v5, 8, v0
+; GFX11-DL-NEXT:    v_bfe_i32 v6, v0, 0, 8
+; GFX11-DL-NEXT:    v_bfe_i32 v7, v1, 0, 8
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-DL-NEXT:    v_perm_b32 v5, v5, v6, 0x5040100
+; GFX11-DL-NEXT:    v_perm_b32 v4, v4, v7, 0x5040100
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-DL-NEXT:    v_ashrrev_i16 v6, 8, v1
+; GFX11-DL-NEXT:    v_ashrrev_i16 v7, 8, v0
+; GFX11-DL-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX11-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_perm_b32 v0, v7, v0, 0x5040100
+; GFX11-DL-NEXT:    v_perm_b32 v1, v6, v1, 0x5040100
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_add_nc_u16 v3, v4, v3
+; GFX11-DL-NEXT:    v_pk_mul_lo_u16 v0, v1, v0
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_add_nc_u16 v1, v3, v5
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_add_nc_u16 v0, v1, v0
+; GFX11-DL-NEXT:    v_add_nc_u16 v0, v0, v3
+; GFX11-DL-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                               ptr addrspace(1) %src2,
                                               ptr addrspace(1) nocapture %dst) {
 entry:
@@ -1165,4 +1222,2215 @@ entry:
   ret void
 }
 
+define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
+; GFX7-LABEL: idot4_acc32_2ele:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 8
+; GFX7-NEXT:    v_bfe_i32 v2, v2, 8, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_bfe_i32 v3, v0, 0, 8
+; GFX7-NEXT:    v_bfe_i32 v0, v0, 8, 8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v3, s4
+; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: idot4_acc32_2ele:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 8
+; GFX8-NEXT:    v_bfe_i32 v3, v3, 8, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 8
+; GFX8-NEXT:    v_bfe_i32 v0, v0, 8, 8
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mad_i32_i24 v1, v1, v2, s2
+; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_acc32_2ele:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v3, s0, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: idot4_acc32_2ele:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc0c0100
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s1
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s1
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v2, v1, s0
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_acc32_2ele:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x1
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v1, 0xc0c0100
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v1, v2, v2, 0xc0c0100
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-DL-NEXT:    v_dot4c_i32_i8_e32 v2, v1, v0
+; GFX10-DL-NEXT:    global_store_dword v3, v2, s[0:1]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32_2ele:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[4:5]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc0c0100
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc0c0100
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v0, v1, s2 neg_lo:[1,1,0]
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+
+  %v1e0 = extractelement <4 x i8> %vec1, i64 0
+  %cv1e0 = sext i8 %v1e0 to i32
+  %v2e0 = extractelement <4 x i8> %vec2, i64 0
+  %cv2e0 = sext i8 %v2e0 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
+
+  %v1e1 = extractelement <4 x i8> %vec1, i64 1
+  %cv1e1 = sext i8 %v1e1 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 1
+  %cv2e1 = sext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+
+  %acc = load i32, ptr addrspace(1) %dst, align 4
+  %add1 = add i32 %mul1, %acc
+  %add2 = add i32 %add1, %mul2
+  store i32 %add2, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+
+define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
+; GFX7-LABEL: idot4_acc32_3ele:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 8
+; GFX7-NEXT:    v_bfe_i32 v3, v2, 8, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_bfe_i32 v4, v0, 0, 8
+; GFX7-NEXT:    v_bfe_i32 v5, v0, 8, 8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v4, s4
+; GFX7-NEXT:    v_bfe_i32 v2, v2, 16, 8
+; GFX7-NEXT:    v_bfe_i32 v0, v0, 16, 8
+; GFX7-NEXT:    v_mad_i32_i24 v1, v3, v5, v1
+; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: idot4_acc32_3ele:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 8
+; GFX8-NEXT:    v_bfe_i32 v4, v3, 8, 8
+; GFX8-NEXT:    v_bfe_i32 v3, v3, 16, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 8
+; GFX8-NEXT:    v_bfe_i32 v5, v0, 8, 8
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mad_i32_i24 v1, v1, v2, s2
+; GFX8-NEXT:    v_bfe_i32 v0, v0, 16, 8
+; GFX8-NEXT:    v_mad_i32_i24 v1, v4, v5, v1
+; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_acc32_3ele:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_bfe_i32 v3, v1, 0, 8
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_bfe_i32 v4, v2, 0, 8
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, v3, v4, s0
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v5, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: idot4_acc32_3ele:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc020100
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s1
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s1
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v2, v1, s0
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_acc32_3ele:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x1
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v1, 0xc020100
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v1, v2, v2, 0xc020100
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-DL-NEXT:    v_dot4c_i32_i8_e32 v2, v1, v0
+; GFX10-DL-NEXT:    global_store_dword v3, v2, s[0:1]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32_3ele:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[4:5]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc020100
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc020100
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v0, v1, s2 neg_lo:[1,1,0]
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+
+  %v1e0 = extractelement <4 x i8> %vec1, i64 0
+  %cv1e0 = sext i8 %v1e0 to i32
+  %v2e0 = extractelement <4 x i8> %vec2, i64 0
+  %cv2e0 = sext i8 %v2e0 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
+
+  %v1e1 = extractelement <4 x i8> %vec1, i64 1
+  %cv1e1 = sext i8 %v1e1 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 1
+  %cv2e1 = sext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+
+  %v1e2 = extractelement <4 x i8> %vec1, i64 2
+  %cv1e2 = sext i8 %v1e2 to i32
+  %v2e2 = extractelement <4 x i8> %vec2, i64 2
+  %cv2e2 = sext i8 %v2e2 to i32
+  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
+
+  %acc = load i32, ptr addrspace(1) %dst, align 4
+  %add1 = add i32 %mul1, %acc
+  %add2 = add i32 %add1, %mul2
+  %add3 = add i32 %add2, %mul3
+  store i32 %add3, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+
+define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
+; GFX7-LABEL: idot4_acc32_3ele_permuted:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 24, v2
+; GFX7-NEXT:    v_bfe_i32 v3, v2, 0, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_ashrrev_i32_e32 v4, 24, v0
+; GFX7-NEXT:    v_bfe_i32 v5, v0, 0, 8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v4, s4
+; GFX7-NEXT:    v_bfe_i32 v2, v2, 16, 8
+; GFX7-NEXT:    v_bfe_i32 v0, v0, 16, 8
+; GFX7-NEXT:    v_mad_i32_i24 v1, v3, v5, v1
+; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: idot4_acc32_3ele_permuted:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 24, v3
+; GFX8-NEXT:    v_bfe_i32 v4, v3, 0, 8
+; GFX8-NEXT:    v_bfe_i32 v3, v3, 16, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 24, v0
+; GFX8-NEXT:    v_bfe_i32 v5, v0, 0, 8
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mad_i32_i24 v1, v1, v2, s2
+; GFX8-NEXT:    v_bfe_i32 v0, v0, 16, 8
+; GFX8-NEXT:    v_mad_i32_i24 v1, v4, v5, v1
+; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_acc32_3ele_permuted:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_ashrrev_i32_e32 v3, 24, v1
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_ashrrev_i32_e32 v4, 24, v2
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, v3, v4, s0
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v5, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: idot4_acc32_3ele_permuted:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc020003
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s1
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s1
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v2, v1, s0
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_acc32_3ele_permuted:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x1
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v1, 0xc020003
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v1, v2, v2, 0xc020003
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-DL-NEXT:    v_dot4c_i32_i8_e32 v2, v1, v0
+; GFX10-DL-NEXT:    global_store_dword v3, v2, s[0:1]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32_3ele_permuted:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[4:5]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc020003
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc020003
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v0, v1, s2 neg_lo:[1,1,0]
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) nocapture %dst) {
+entry: ; Signed i8 dot product over lanes 3, 0 and 2 only (lane 1 is never read), added to the i32 loaded from %dst.
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx ; both source vectors are indexed by the workitem id
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+
+  %v1e0 = extractelement <4 x i8> %vec1, i64 3 ; value names say e0/e1/e2, but the lanes actually read are 3, 0, 2
+  %cv1e0 = sext i8 %v1e0 to i32
+  %v2e0 = extractelement <4 x i8> %vec2, i64 3
+  %cv2e0 = sext i8 %v2e0 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0 ; lane 3 product. NOTE(review): nuw is poison if the unsigned product wraps (possible with negative sext inputs) - confirm intended
+
+  %v1e1 = extractelement <4 x i8> %vec1, i64 0
+  %cv1e1 = sext i8 %v1e1 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 0
+  %cv2e1 = sext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 ; lane 0 product
+
+  %v1e2 = extractelement <4 x i8> %vec1, i64 2
+  %cv1e2 = sext i8 %v1e2 to i32
+  %v2e2 = extractelement <4 x i8> %vec2, i64 2
+  %cv2e2 = sext i8 %v2e2 to i32
+  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2 ; lane 2 product
+
+  %acc = load i32, ptr addrspace(1) %dst, align 4 ; existing value at %dst is the accumulator
+  %add1 = add i32 %mul1, %acc
+  %add2 = add i32 %add1, %mul2
+  %add3 = add i32 %add2, %mul3
+  store i32 %add3, ptr addrspace(1) %dst, align 4 ; result overwrites the accumulator in place
+  ret void
+}
+
+define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
+; GFX7-LABEL: idot4_acc32_opt:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_bfe_i32 v3, v2, 8, 8
+; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_bfe_i32 v6, v0, 8, 8
+; GFX7-NEXT:    v_bfe_i32 v5, v0, 0, 8
+; GFX7-NEXT:    v_mul_i32_i24_e32 v3, v3, v6
+; GFX7-NEXT:    v_bfe_i32 v4, v2, 16, 8
+; GFX7-NEXT:    v_bfe_i32 v7, v0, 16, 8
+; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v5, v3
+; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
+; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
+; GFX7-NEXT:    v_mad_i32_i24 v1, v4, v7, v1
+; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: idot4_acc32_opt:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v2, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_bfe_i32 v4, v3, 0, 8
+; GFX8-NEXT:    v_bfe_i32 v7, v3, 16, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_bfe_i32 v5, v2, 0, 8
+; GFX8-NEXT:    v_mul_i32_i24_sdwa v6, sext(v3), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX8-NEXT:    v_bfe_i32 v8, v2, 16, 8
+; GFX8-NEXT:    v_mad_i32_i24 v4, v4, v5, v6
+; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 24, v3
+; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
+; GFX8-NEXT:    v_mad_i32_i24 v4, v7, v8, v4
+; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v2, v4
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_acc32_opt:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_bfe_i32 v3, v1, 0, 8
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_bfe_i32 v4, v2, 0, 8
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v6, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, v3, v4, v5
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v6, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: idot4_acc32_opt:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v1, v2, 0
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_acc32_opt:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x1
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_dot4c_i32_i8_e32 v0, v1, v2
+; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32_opt:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v1, v0, 0 neg_lo:[1,1,0]
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) nocapture %dst) {
+entry: ; Signed i8 dot product over all four lanes; nothing is loaded from %dst, the sum of products is only stored there.
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx ; both source vectors are indexed by the workitem id
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+
+  %v1e0 = extractelement <4 x i8> %vec1, i64 0
+  %cv1e0 = sext i8 %v1e0 to i32
+  %v2e0 = extractelement <4 x i8> %vec2, i64 0
+  %cv2e0 = sext i8 %v2e0 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0 ; lane 0 product. NOTE(review): nuw is poison if the unsigned product wraps (possible with negative sext inputs) - confirm intended
+
+  %v1e1 = extractelement <4 x i8> %vec1, i64 1
+  %cv1e1 = sext i8 %v1e1 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 1
+  %cv2e1 = sext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 ; lane 1 product
+
+  %v1e2 = extractelement <4 x i8> %vec1, i64 2
+  %cv1e2 = sext i8 %v1e2 to i32
+  %v2e2 = extractelement <4 x i8> %vec2, i64 2
+  %cv2e2 = sext i8 %v2e2 to i32
+  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2 ; lane 2 product
+
+  %v1e3 = extractelement <4 x i8> %vec1, i64 3
+  %cv1e3 = sext i8 %v1e3 to i32
+  %v2e3 = extractelement <4 x i8> %vec2, i64 3
+  %cv2e3 = sext i8 %v2e3 to i32
+  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3 ; lane 3 product
+
+  %add2 = add i32 %mul1, %mul2 ; the add chain starts from the products themselves; there is no %acc operand in this function
+  %add3 = add i32 %add2, %mul3
+  %add4 = add i32 %add3, %mul4
+  store i32 %add4, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1,
+; GFX7-LABEL: idot4_acc32_3src:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; GFX7-NEXT:    s_mov_b32 s11, 0xf000
+; GFX7-NEXT:    s_mov_b32 s14, 0
+; GFX7-NEXT:    s_mov_b32 s15, s11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX7-NEXT:    s_mov_b32 s10, -1
+; GFX7-NEXT:    s_mov_b32 s8, s6
+; GFX7-NEXT:    s_mov_b32 s9, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 8
+; GFX7-NEXT:    v_bfe_i32 v4, v2, 8, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_bfe_i32 v3, v3, 8, 8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v1, s0
+; GFX7-NEXT:    v_bfe_i32 v5, v2, 16, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_bfe_i32 v6, v0, 16, 8
+; GFX7-NEXT:    v_mad_i32_i24 v1, v4, v3, v1
+; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
+; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
+; GFX7-NEXT:    v_mad_i32_i24 v1, v5, v6, v1
+; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: idot4_acc32_3src:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v4, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 8
+; GFX8-NEXT:    v_bfe_i32 v2, v3, 8, 8
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mad_i32_i24 v1, v1, v1, s0
+; GFX8-NEXT:    v_bfe_i32 v5, v3, 16, 8
+; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 24, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_bfe_i32 v4, v4, 8, 8
+; GFX8-NEXT:    v_mad_i32_i24 v1, v2, v4, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_bfe_i32 v6, v0, 16, 8
+; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
+; GFX8-NEXT:    v_mad_i32_i24 v1, v5, v6, v1
+; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_acc32_3src:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
+; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[4:5]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v4, sext(v1), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v2, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v5, sext(v1), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_add3_u32 v2, v4, s0, v2
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v5, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: idot4_acc32_3src:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[0:1]
+; GFX9-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_mov_b32 s0, 0x706010c
+; GFX9-DL-NEXT:    s_mov_b32 s2, 0xc0c0c00
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_perm_b32 v1, v2, v1, s0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s2
+; GFX9-DL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v3, v1, s1
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_acc32_3src:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x2
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_perm_b32 v0, v2, v1, 0x706010c
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v1, v3, v3, 0xc0c0c00
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-DL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-DL-NEXT:    v_dot4c_i32_i8_e32 v1, v3, v0
+; GFX10-DL-NEXT:    global_store_dword v2, v1, s[6:7]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32_3src:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x2
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[0:1]
+; GFX11-DL-NEXT:    s_load_b32 s0, s[6:7], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_perm_b32 v1, v2, v1, 0x706010c
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v2, v0, v0, 0xc0c0c00
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[6:7]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) %src3,
+                                       ptr addrspace(1) nocapture %dst) {
+entry: ; Four-lane signed i8 dot product: %vec1 supplies every left factor; right factors come from %vec1 (lane 0), %vec2 (lane 1), %vec3 (lanes 2, 3).
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx ; all three source vectors are indexed by the workitem id
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+  %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx
+  %vec3 = load <4 x i8>, ptr addrspace(1) %gep3
+
+  %v1e0 = extractelement <4 x i8> %vec1, i64 0
+  %cv1e0 = sext i8 %v1e0 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv1e0 ; lane 0 of %vec1 squared. NOTE(review): nuw is poison if the unsigned product wraps (possible with negative sext inputs) - confirm intended
+
+  %v1e1 = extractelement <4 x i8> %vec1, i64 1
+  %cv1e1 = sext i8 %v1e1 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 1
+  %cv2e1 = sext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 ; lane 1: %vec1 times %vec2 (the only use of %vec2)
+
+  %v1e2 = extractelement <4 x i8> %vec1, i64 2
+  %cv1e2 = sext i8 %v1e2 to i32
+  %v3e2 = extractelement <4 x i8> %vec3, i64 2
+  %cv3e2 = sext i8 %v3e2 to i32
+  %mul3 = mul nuw nsw i32 %cv1e2, %cv3e2 ; lane 2: %vec1 times %vec3
+
+  %v1e3 = extractelement <4 x i8> %vec1, i64 3
+  %cv1e3 = sext i8 %v1e3 to i32
+  %v3e3 = extractelement <4 x i8> %vec3, i64 3
+  %cv3e3 = sext i8 %v3e3 to i32
+  %mul4 = mul nuw nsw i32 %cv1e3, %cv3e3 ; lane 3: %vec1 times %vec3
+
+  %acc = load i32, ptr addrspace(1) %dst, align 4 ; existing value at %dst is the accumulator
+  %add1 = add i32 %mul1, %acc
+  %add2 = add i32 %add1, %mul2
+  %add3 = add i32 %add2, %mul3
+  %add4 = add i32 %add3, %mul4
+  store i32 %add4, ptr addrspace(1) %dst, align 4 ; result overwrites the accumulator in place
+  ret void
+}
+
+define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1,
+; GFX7-LABEL: idot4_acc32_3src_3ele:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; GFX7-NEXT:    s_mov_b32 s11, 0xf000
+; GFX7-NEXT:    s_mov_b32 s14, 0
+; GFX7-NEXT:    s_mov_b32 s15, s11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX7-NEXT:    s_mov_b32 s10, -1
+; GFX7-NEXT:    s_mov_b32 s8, s6
+; GFX7-NEXT:    s_mov_b32 s9, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 8
+; GFX7-NEXT:    v_bfe_i32 v4, v2, 8, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_bfe_i32 v3, v3, 8, 8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v1, s0
+; GFX7-NEXT:    v_bfe_i32 v2, v2, 16, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_bfe_i32 v0, v0, 16, 8
+; GFX7-NEXT:    v_mad_i32_i24 v1, v4, v3, v1
+; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: idot4_acc32_3src_3ele:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v4, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 8
+; GFX8-NEXT:    v_bfe_i32 v2, v3, 8, 8
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mad_i32_i24 v1, v1, v1, s0
+; GFX8-NEXT:    v_bfe_i32 v3, v3, 16, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_bfe_i32 v4, v4, 8, 8
+; GFX8-NEXT:    v_mad_i32_i24 v1, v2, v4, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_bfe_i32 v0, v0, 16, 8
+; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_acc32_3src_3ele:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
+; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[4:5]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NODL-NEXT:    v_bfe_i32 v4, v1, 0, 8
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v2, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v3, v4, v4, s0
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v3, v2, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: idot4_acc32_3src_3ele:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[0:1]
+; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_mov_b32 s0, 0xc06010c
+; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc0c0c00
+; GFX9-DL-NEXT:    s_mov_b32 s2, 0xc020100
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_perm_b32 v1, v2, v1, s0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s1
+; GFX9-DL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s2
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v2, v1, s3
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_acc32_3src_3ele:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x2
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_perm_b32 v0, v2, v1, 0xc06010c
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v1, v3, v3, 0xc0c0c00
+; GFX10-DL-NEXT:    v_perm_b32 v2, v3, v3, 0xc020100
+; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-DL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-DL-NEXT:    v_dot4c_i32_i8_e32 v1, v2, v0
+; GFX10-DL-NEXT:    global_store_dword v3, v1, s[6:7]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32_3src_3ele:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x2
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[0:1]
+; GFX11-DL-NEXT:    s_load_b32 s0, s[6:7], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_perm_b32 v1, v2, v1, 0xc06010c
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v2, v0, v0, 0xc0c0c00
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc020100
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[6:7]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) %src3,
+                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+  %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx
+  %vec3 = load <4 x i8>, ptr addrspace(1) %gep3
+
+  %v1e0 = extractelement <4 x i8> %vec1, i64 0
+  %cv1e0 = sext i8 %v1e0 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv1e0
+
+  %v1e1 = extractelement <4 x i8> %vec1, i64 1
+  %cv1e1 = sext i8 %v1e1 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 1
+  %cv2e1 = sext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+
+  %v1e2 = extractelement <4 x i8> %vec1, i64 2
+  %cv1e2 = sext i8 %v1e2 to i32
+  %v3e2 = extractelement <4 x i8> %vec3, i64 2
+  %cv3e2 = sext i8 %v3e2 to i32
+  %mul3 = mul nuw nsw i32 %cv1e2, %cv3e2
+
+  %acc = load i32, ptr addrspace(1) %dst, align 4
+  %add1 = add i32 %mul1, %acc
+  %add2 = add i32 %add1, %mul2
+  %add3 = add i32 %add2, %mul3
+  store i32 %add3, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @idot4_bad_source(ptr addrspace(1) %src1, ; Kernel: 3-term signed byte dot product whose first term takes one factor from an i16 argument.
+; GFX7-LABEL: idot4_bad_source:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dword s12, s[0:1], 0xf
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x11
+; GFX7-NEXT:    s_sext_i32_i16 s5, s12
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_bfe_i32 v3, v2, 0, 8
+; GFX7-NEXT:    v_bfe_i32 v4, v2, 8, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_bfe_i32 v5, v0, 8, 8
+; GFX7-NEXT:    v_mad_i32_i24 v1, v3, s5, v1
+; GFX7-NEXT:    v_bfe_i32 v2, v2, 16, 8
+; GFX7-NEXT:    v_bfe_i32 v0, v0, 16, 8
+; GFX7-NEXT:    v_mad_i32_i24 v1, v4, v5, v1
+; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: idot4_bad_source:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x3c
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX8-NEXT:    s_sext_i32_i16 s2, s2
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_bfe_i32 v2, v3, 0, 8
+; GFX8-NEXT:    v_bfe_i32 v4, v3, 8, 8
+; GFX8-NEXT:    v_mad_i32_i24 v1, v2, s2, v1
+; GFX8-NEXT:    v_bfe_i32 v3, v3, 16, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_bfe_i32 v5, v0, 8, 8
+; GFX8-NEXT:    v_bfe_i32 v0, v0, 16, 8
+; GFX8-NEXT:    v_mad_i32_i24 v1, v4, v5, v1
+; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_bad_source:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    s_load_dword s2, s[0:1], 0x3c
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT:    s_sext_i32_i16 s2, s2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_bfe_i32 v3, v1, 0, 8
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v4, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, v3, s2, v2
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v4, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: idot4_bad_source:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_load_dword s2, s[0:1], 0x3c
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-DL-NEXT:    s_mov_b32 s4, 0xc0c0201
+; GFX9-DL-NEXT:    s_sext_i32_i16 s2, s2
+; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_bfe_i32 v4, v1, 0, 8
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s4
+; GFX9-DL-NEXT:    v_mad_i32_i24 v3, v4, s2, v3
+; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s4
+; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v1, v2, v3
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_bad_source:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_clause 0x1
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x3c
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x1
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT:    s_sext_i32_i16 s2, s2
+; GFX10-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_bfe_i32 v0, v1, 0, 8
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v2, v2, v2, 0xc0c0201
+; GFX10-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc0c0201
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mad_i32_i24 v0, v0, s2, s3
+; GFX10-DL-NEXT:    v_dot4c_i32_i8_e32 v0, v1, v2
+; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_bad_source:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x3c
+; GFX11-DL-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x44
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_sext_i32_i16 s2, s2
+; GFX11-DL-NEXT:    s_load_b32 s3, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_bfe_i32 v2, v1, 0, 8
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc0c0201
+; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc0c0201
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    v_mad_i32_i24 v2, v2, s2, s3
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v1, v0, v2 neg_lo:[1,1,0]
+; GFX11-DL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) %src3,
+                                       i16 %badsource,
+                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+  %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx
+  %vec3 = load <4 x i8>, ptr addrspace(1) %gep3 ; %vec3 has no users below
+  ; Term 1: byte 0 of %vec1 times the sign-extended i16 argument %badsource.
+  %v1e0 = extractelement <4 x i8> %vec1, i64 0
+  %cv1e0 = sext i8 %v1e0 to i32
+  %v2e0 = extractelement <4 x i8> %vec2, i64 0 ; %v2e0 has no users; %mul1 takes %other instead
+  %other = sext i16 %badsource to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %other ; NOTE(review): nuw is poison if the product wraps unsigned, which negative sext inputs can cause - confirm flag
+  ; Term 2: byte 1 of %vec1 times byte 1 of %vec2, each sign-extended to i32.
+  %v1e1 = extractelement <4 x i8> %vec1, i64 1
+  %cv1e1 = sext i8 %v1e1 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 1
+  %cv2e1 = sext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+  ; Term 3: byte 2 of %vec1 times byte 2 of %vec2, each sign-extended to i32.
+  %v2e2 = extractelement <4 x i8> %vec2, i64 2
+  %cv2e2 = sext i8 %v2e2 to i32
+  %v1e2 = extractelement <4 x i8> %vec1, i64 2
+  %cv1e2 = sext i8 %v1e2 to i32
+  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
+  ; Add the three terms to the i32 loaded from %dst and store the sum back to %dst.
+
+  %acc = load i32, ptr addrspace(1) %dst, align 4
+  %mad1 = add i32 %mul1, %acc
+  %mad2 = add i32 %mad1, %mul2
+  %mad3 = add i32 %mad2, %mul3
+  ; In the -DL check groups above, term 1 stays a v_mad_i32_i24 whose result is the v_dot4 accumulator.
+  store i32 %mad3, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+
+define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1, ; Kernel: 3-term signed byte dot product of the %src1 and %src2 vectors, accumulated into %dst.
+; GFX7-LABEL: idot4_commutative:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xf
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 8
+; GFX7-NEXT:    v_bfe_i32 v3, v2, 8, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_bfe_i32 v4, v0, 0, 8
+; GFX7-NEXT:    v_bfe_i32 v5, v0, 8, 8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v4, s4
+; GFX7-NEXT:    v_bfe_i32 v0, v0, 16, 8
+; GFX7-NEXT:    v_bfe_i32 v2, v2, 16, 8
+; GFX7-NEXT:    v_mad_i32_i24 v1, v3, v5, v1
+; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: idot4_commutative:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 8
+; GFX8-NEXT:    v_bfe_i32 v4, v3, 8, 8
+; GFX8-NEXT:    v_bfe_i32 v3, v3, 16, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 8
+; GFX8-NEXT:    v_bfe_i32 v5, v0, 8, 8
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mad_i32_i24 v1, v1, v2, s2
+; GFX8-NEXT:    v_bfe_i32 v0, v0, 16, 8
+; GFX8-NEXT:    v_mad_i32_i24 v1, v4, v5, v1
+; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_commutative:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_bfe_i32 v3, v1, 0, 8
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_bfe_i32 v4, v2, 0, 8
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, v3, v4, s0
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v5, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: idot4_commutative:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc020100
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s1
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s1
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v2, v1, s0
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_commutative:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
+; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x1
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v1, 0xc020100
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v1, v2, v2, 0xc020100
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-DL-NEXT:    v_dot4c_i32_i8_e32 v2, v1, v0
+; GFX10-DL-NEXT:    global_store_dword v3, v2, s[0:1]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_commutative:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x3c
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[4:5]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc020100
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc020100
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v0, v1, s2 neg_lo:[1,1,0]
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) %src3,
+                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+  %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx
+  %vec3 = load <4 x i8>, ptr addrspace(1) %gep3 ; %vec3 has no users below
+  ; Term 1: byte 0 of %vec1 times byte 0 of %vec2, each sign-extended to i32.
+  %v1e0 = extractelement <4 x i8> %vec1, i64 0
+  %cv1e0 = sext i8 %v1e0 to i32
+  %v2e0 = extractelement <4 x i8> %vec2, i64 0
+  %cv2e0 = sext i8 %v2e0 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0 ; NOTE(review): nuw is poison if the product wraps unsigned, which negative sext inputs can cause - confirm flag
+  ; Term 2: byte 1 of %vec1 times byte 1 of %vec2, each sign-extended to i32.
+  %v1e1 = extractelement <4 x i8> %vec1, i64 1
+  %cv1e1 = sext i8 %v1e1 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 1
+  %cv2e1 = sext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+  ; Term 3: byte 2 of each vector; here the %vec2 byte is extracted before the %vec1 byte.
+  %v2e2 = extractelement <4 x i8> %vec2, i64 2
+  %cv2e2 = sext i8 %v2e2 to i32
+  %v1e2 = extractelement <4 x i8> %vec1, i64 2
+  %cv1e2 = sext i8 %v1e2 to i32
+  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
+  ; Add the three terms to the i32 loaded from %dst and store the sum back to %dst.
+
+  %acc = load i32, ptr addrspace(1) %dst, align 4
+  %mad1 = add i32 %mul1, %acc
+  %mad2 = add i32 %mad1, %mul2
+  %mad3 = add i32 %mad2, %mul3
+  ; In the -DL check groups above, all three terms fold into a single v_dot4 that takes the loaded accumulator.
+  store i32 %mad3, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; Kernel: 3-term signed byte dot product drawing bytes from three loaded vectors, with an i32 accumulator.
+; GFX7-LABEL: idot4_acc32_3src_3ele_src0:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; GFX7-NEXT:    s_mov_b32 s11, 0xf000
+; GFX7-NEXT:    s_mov_b32 s14, 0
+; GFX7-NEXT:    s_mov_b32 s15, s11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX7-NEXT:    s_mov_b32 s10, -1
+; GFX7-NEXT:    s_mov_b32 s8, s6
+; GFX7-NEXT:    s_mov_b32 s9, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-NEXT:    v_bfe_i32 v1, v2, 8, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_bfe_i32 v2, v3, 8, 8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mad_i32_i24 v4, v2, v2, s0
+; GFX7-NEXT:    v_bfe_i32 v3, v3, 16, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_bfe_i32 v0, v0, 16, 8
+; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v2, v4
+; GFX7-NEXT:    v_mad_i32_i24 v0, v3, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: idot4_acc32_3src_3ele_src0:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v4, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-NEXT:    v_bfe_i32 v2, v3, 8, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_bfe_i32 v1, v4, 8, 8
+; GFX8-NEXT:    v_bfe_i32 v3, v4, 16, 8
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mad_i32_i24 v4, v1, v1, s0
+; GFX8-NEXT:    v_mad_i32_i24 v1, v2, v1, v4
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_bfe_i32 v0, v0, 16, 8
+; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_acc32_3src_3ele_src0:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[0:1]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NODL-NEXT:    v_bfe_i32 v4, v1, 8, 8
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v2, sext(v3), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v3, v4, v4, s0
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v3, v2, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: idot4_acc32_3src_3ele_src0:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[0:1]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[2:3]
+; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_mov_b32 s0, 0xc06010c
+; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc0c0c01
+; GFX9-DL-NEXT:    s_mov_b32 s2, 0xc020101
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v2, s0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s1
+; GFX9-DL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s2
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v2, v1, s3
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_acc32_3src_3ele_src0:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x2
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT:    global_load_dword v3, v0, s[2:3]
+; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v2, 0xc06010c
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v1, v3, v3, 0xc0c0c01
+; GFX10-DL-NEXT:    v_perm_b32 v2, v3, v3, 0xc020101
+; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-DL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-DL-NEXT:    v_dot4c_i32_i8_e32 v1, v2, v0
+; GFX10-DL-NEXT:    global_store_dword v3, v1, s[6:7]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32_3src_3ele_src0:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x2
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[2:3]
+; GFX11-DL-NEXT:    s_load_b32 s0, s[6:7], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v2, 0xc06010c
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v2, v0, v0, 0xc0c0c01
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc020101
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[6:7]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) %src3,
+                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+  %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx
+  %vec3 = load <4 x i8>, ptr addrspace(1) %gep3
+  ; Term 1: byte 1 of %vec2, sign-extended to i32 and multiplied by itself.
+  %v2e0 = extractelement <4 x i8> %vec2, i64 1 ; despite the e0 suffix this extracts element 1
+  %cv2e0 = sext i8 %v2e0 to i32
+  %mul1 = mul nuw nsw i32 %cv2e0, %cv2e0 ; NOTE(review): nuw is poison if the product wraps unsigned, which negative sext inputs can cause - confirm flag
+  ; Term 2: byte 1 of %vec1 times byte 1 of %vec2, each sign-extended to i32.
+  %v1e1 = extractelement <4 x i8> %vec1, i64 1
+  %cv1e1 = sext i8 %v1e1 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 1
+  %cv2e1 = sext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+  ; Term 3: byte 2 of %vec2 times byte 2 of %vec3, each sign-extended to i32.
+  %v3e2 = extractelement <4 x i8> %vec3, i64 2
+  %cv3e2 = sext i8 %v3e2 to i32
+  %v2e2 = extractelement <4 x i8> %vec2, i64 2
+  %cv2e2 = sext i8 %v2e2 to i32
+  %mul3 = mul nuw nsw i32 %cv2e2, %cv3e2
+  ; Add the three terms to the i32 loaded from %dst and store the sum back to %dst.
+
+  %acc = load i32, ptr addrspace(1) %dst, align 4
+  %mad1 = add i32 %mul1, %acc
+  %mad2 = add i32 %mad1, %mul2
+  %mad3 = add i32 %mad2, %mul3
+  ; In the -DL check groups above, v_perm_b32 and v_or_b32 assemble the operands of a single v_dot4.
+  store i32 %mad3, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1,
+; GFX7-LABEL: idot4_4src:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x9
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s14, 0
+; GFX7-NEXT:    s_mov_b32 s15, s3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[8:9]
+; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x11
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX7-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 8
+; GFX7-NEXT:    v_bfe_i32 v2, v2, 8, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-NEXT:    v_bfe_i32 v5, v3, 0, 8
+; GFX7-NEXT:    v_bfe_i32 v3, v3, 8, 8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_bfe_i32 v2, v4, 0, 8
+; GFX7-NEXT:    v_bfe_i32 v4, v4, 8, 8
+; GFX7-NEXT:    v_mad_i32_i24 v1, v5, v3, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_bfe_i32 v3, v0, 0, 8
+; GFX7-NEXT:    v_bfe_i32 v0, v0, 8, 8
+; GFX7-NEXT:    v_mad_i32_i24 v1, v2, v4, v1
+; GFX7-NEXT:    v_mad_i32_i24 v0, v3, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: idot4_4src:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v4, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s9
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s8, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v5, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s11
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s10, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 8
+; GFX8-NEXT:    v_bfe_i32 v2, v3, 8, 8
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mad_i32_i24 v1, v1, v2, s2
+; GFX8-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-NEXT:    v_bfe_i32 v3, v4, 0, 8
+; GFX8-NEXT:    v_bfe_i32 v4, v4, 8, 8
+; GFX8-NEXT:    v_mad_i32_i24 v1, v3, v4, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_bfe_i32 v6, v5, 0, 8
+; GFX8-NEXT:    v_bfe_i32 v5, v5, 8, 8
+; GFX8-NEXT:    v_mad_i32_i24 v1, v6, v5, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_bfe_i32 v7, v0, 0, 8
+; GFX8-NEXT:    v_bfe_i32 v0, v0, 8, 8
+; GFX8-NEXT:    v_mad_i32_i24 v2, v7, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_4src:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[8:9]
+; GFX9-NODL-NEXT:    global_load_dword v4, v0, s[10:11]
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v2, sext(v2), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v3), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v4, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, s2, v2
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, v3, v4
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: idot4_4src:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX9-DL-NEXT:    s_mov_b32 s2, 0xc0c0501
+; GFX9-DL-NEXT:    s_mov_b32 s3, 0x5010c0c
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[8:9]
+; GFX9-DL-NEXT:    global_load_dword v4, v0, s[10:11]
+; GFX9-DL-NEXT:    s_mov_b32 s4, 0xc0c0400
+; GFX9-DL-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_mov_b32 s5, 0x4000c0c
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-DL-NEXT:    v_perm_b32 v5, v2, v1, s2
+; GFX9-DL-NEXT:    v_perm_b32 v1, v2, v1, s4
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v6, v4, v3, s3
+; GFX9-DL-NEXT:    v_perm_b32 v2, v4, v3, s5
+; GFX9-DL-NEXT:    v_or_b32_e32 v3, v6, v5
+; GFX9-DL-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v1, v3, s6
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_4src:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x3
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT:    global_load_dword v3, v0, s[8:9]
+; GFX10-DL-NEXT:    global_load_dword v4, v0, s[10:11]
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-DL-NEXT:    v_perm_b32 v0, v2, v1, 0xc0c0501
+; GFX10-DL-NEXT:    v_perm_b32 v1, v2, v1, 0xc0c0400
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v5, v4, v3, 0x5010c0c
+; GFX10-DL-NEXT:    v_perm_b32 v2, v4, v3, 0x4000c0c
+; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-DL-NEXT:    v_or_b32_e32 v0, v5, v0
+; GFX10-DL-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-DL-NEXT:    v_dot4c_i32_i8_e32 v2, v1, v0
+; GFX10-DL-NEXT:    global_store_dword v3, v2, s[0:1]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_4src:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x44
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x3
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_b32 v3, v0, s[8:9]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[10:11]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-DL-NEXT:    v_perm_b32 v4, v2, v1, 0xc0c0501
+; GFX11-DL-NEXT:    v_perm_b32 v1, v2, v1, 0xc0c0400
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v5, v0, v3, 0x5010c0c
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v3, 0x4000c0c
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_or_b32_e32 v2, v5, v4
+; GFX11-DL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-DL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v0, v2, s2 neg_lo:[1,1,0]
+; GFX11-DL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) %src3,
+                                       ptr addrspace(1) %src4,
+                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+  %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx
+  %vec3 = load <4 x i8>, ptr addrspace(1) %gep3
+  %gep4 = getelementptr <4 x i8>, ptr addrspace(1) %src4, i32 %idx
+  %vec4 = load <4 x i8>, ptr addrspace(1) %gep4
+
+
+  %v1e0 = extractelement <4 x i8> %vec1, i64 0
+  %cv1e0 = sext i8 %v1e0 to i32
+  %v1e1 = extractelement <4 x i8> %vec1, i64 1
+  %cv1e1 = sext i8 %v1e1 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv1e1
+
+  %v2e0 = extractelement <4 x i8> %vec2, i64 0
+  %cv2e0 = sext i8 %v2e0 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 1
+  %cv2e1 = sext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv2e0, %cv2e1
+
+  %v3e0 = extractelement <4 x i8> %vec3, i64 0
+  %cv3e0 = sext i8 %v3e0 to i32
+  %v3e1 = extractelement <4 x i8> %vec3, i64 1
+  %cv3e1 = sext i8 %v3e1 to i32
+  %mul3 = mul nuw nsw i32 %cv3e0, %cv3e1
+
+  %v4e0 = extractelement <4 x i8> %vec4, i64 0
+  %cv4e0 = sext i8 %v4e0 to i32
+  %v4e1 = extractelement <4 x i8> %vec4, i64 1
+  %cv4e1 = sext i8 %v4e1 to i32
+  %mul4 = mul nuw nsw i32 %cv4e0, %cv4e1
+
+
+  %acc = load i32, ptr addrspace(1) %dst, align 4
+  %mad1 = add i32 %mul1, %acc
+  %mad2 = add i32 %mad1, %mul2
+  %mad3 = add i32 %mad2, %mul3
+  %mad4 = add i32 %mad3, %mul4
+
+  store i32 %mad4, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
+; GFX7-LABEL: idot4_nonstandard_signed:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 8
+; GFX7-NEXT:    v_bfe_i32 v3, v2, 8, 8
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xff, v0
+; GFX7-NEXT:    v_bfe_i32 v4, v2, 16, 8
+; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX7-NEXT:    v_mul_u32_u24_e32 v1, v1, v5
+; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
+; GFX7-NEXT:    v_bfe_u32 v7, v0, 16, 8
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v3, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX7-NEXT:    v_mad_u32_u24 v1, v7, v4, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: idot4_nonstandard_signed:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0xff
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v2, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; GFX8-NEXT:    v_bfe_i32 v7, v7, 0, 8
+; GFX8-NEXT:    v_bfe_i32 v5, v5, 0, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
+; GFX8-NEXT:    v_mul_lo_u16_sdwa v6, sext(v3), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX8-NEXT:    v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
+; GFX8-NEXT:    v_mad_u16 v6, v8, v7, v6
+; GFX8-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX8-NEXT:    v_mad_u16 v4, v4, v5, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
+; GFX8-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_nonstandard_signed:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_movk_i32 s0, 0xff
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
+; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-NODL-NEXT:    v_mul_lo_u16_sdwa v4, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NODL-NEXT:    v_bfe_i32 v5, v5, 0, 8
+; GFX9-NODL-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX9-NODL-NEXT:    v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
+; GFX9-NODL-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX9-NODL-NEXT:    v_mad_legacy_u16 v4, v6, v5, v4
+; GFX9-NODL-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v7, v3, v4
+; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
+; GFX9-NODL-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: idot4_nonstandard_signed:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_movk_i32 s0, 0xff
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v4, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-DL-NEXT:    v_bfe_i32 v5, v5, 0, 8
+; GFX9-DL-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX9-DL-NEXT:    v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
+; GFX9-DL-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX9-DL-NEXT:    v_mad_legacy_u16 v4, v6, v5, v4
+; GFX9-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v7, v3, v4
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
+; GFX9-DL-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_nonstandard_signed:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v6, 0xff
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x1
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_bfe_i32 v0, v1, 0, 8
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_and_b32_e32 v3, 0xff, v2
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
+; GFX10-DL-NEXT:    v_mul_lo_u16 v0, v0, v3
+; GFX10-DL-NEXT:    v_bfe_i32 v3, v4, 0, 8
+; GFX10-DL-NEXT:    v_and_b32_e32 v4, 0xff, v5
+; GFX10-DL-NEXT:    v_and_b32_sdwa v5, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_bfe_i32 v6, v7, 0, 8
+; GFX10-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v3, v0
+; GFX10-DL-NEXT:    v_mad_u16 v0, v5, v6, v0
+; GFX10-DL-NEXT:    v_mad_u16 v0, v1, v2, v0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-DL-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX10-DL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_nonstandard_signed:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_bfe_i32 v2, v1, 0, 8
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_and_b32_e32 v3, 0xff, v0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX11-DL-NEXT:    v_mul_lo_u16 v2, v2, v3
+; GFX11-DL-NEXT:    v_bfe_i32 v3, v4, 0, 8
+; GFX11-DL-NEXT:    v_and_b32_e32 v4, 0xff, v5
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
+; GFX11-DL-NEXT:    v_bfe_i32 v5, v6, 0, 8
+; GFX11-DL-NEXT:    v_and_b32_e32 v6, 0xff, v7
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX11-DL-NEXT:    v_mad_u16 v2, v4, v3, v2
+; GFX11-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_mad_u16 v2, v6, v5, v2
+; GFX11-DL-NEXT:    v_mad_u16 v0, v1, v0, v2
+; GFX11-DL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX11-DL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+  %v1e0 = extractelement <4 x i8> %vec1, i64 0
+  %v1e0e = sext i8 %v1e0 to i16
+  %v2e0 = extractelement <4 x i8> %vec2, i64 0
+  %v2e0e = zext i8 %v2e0 to i16
+  %mul0 = mul nsw i16 %v1e0e, %v2e0e
+  %add0 = add i16 %mul0, 0
+
+  %v1e1 = extractelement <4 x i8> %vec1, i64 1
+  %v1e1e = sext i8 %v1e1 to i16
+  %v2e1 = extractelement <4 x i8> %vec2, i64 1
+  %v2e1e = zext i8 %v2e1 to i16
+  %mul1 = mul nsw i16 %v2e1e, %v1e1e
+  %add1 = add i16 %mul1, %add0
+  %v1e2 = extractelement <4 x i8> %vec1, i64 2
+  %v1e2e = sext i8 %v1e2 to i16
+  %v2e2 = extractelement <4 x i8> %vec2, i64 2
+  %v2e2e = zext i8 %v2e2 to i16
+  %mul2 = mul nsw i16 %v2e2e, %v1e2e
+  %add2 = add i16 %mul2, %add1
+  %v1e3 = extractelement <4 x i8> %vec1, i64 3
+  %v1e3e = sext i8 %v1e3 to i16
+  %v2e3 = extractelement <4 x i8> %vec2, i64 3
+  %v2e3e = zext i8 %v2e3 to i16
+  %mul3 = mul nsw i16 %v1e3e, %v2e3e
+  %add3 = add i16 %mul3, %add2
+  %res = sext i16 %add3 to i32
+  store i32 %res, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+
 declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index b7821f8fd6da511..a82c5215f3b2c65 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -5,6 +5,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DL %s
 
 define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot4_acc32:
@@ -127,6 +128,24 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, s2
 ; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_acc32:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s2
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
 entry:
@@ -279,30 +298,14 @@ define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT:    s_movk_i32 s0, 0xff
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
-; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-DL-NEXT:    global_load_ushort v3, v0, s[2:3]
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT:    v_and_b32_e32 v4, 0xff, v1
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
-; GFX9-DL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX9-DL-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
+; GFX9-DL-NEXT:    global_load_ushort v4, v1, s[2:3]
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v4, v5, v3
-; GFX9-DL-NEXT:    v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-DL-NEXT:    v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v6, v7, v3
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v8, v9, v3
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
-; GFX9-DL-NEXT:    global_store_short v0, v1, s[2:3]
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v0, v2, v3, v4
+; GFX9-DL-NEXT:    global_store_short v1, v0, s[2:3]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot4_acc16:
@@ -311,32 +314,34 @@ define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v8, 0xff
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_clause 0x1
-; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
-; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT:    global_load_ushort v3, v0, s[2:3]
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT:    v_and_b32_e32 v4, 0xff, v1
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
-; GFX10-DL-NEXT:    v_and_b32_e32 v7, 0xff, v2
-; GFX10-DL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX10-DL-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
+; GFX10-DL-NEXT:    global_load_ushort v4, v1, s[2:3]
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v7, v3
-; GFX10-DL-NEXT:    v_and_b32_sdwa v4, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-DL-NEXT:    v_and_b32_sdwa v7, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX10-DL-NEXT:    v_mad_u16 v3, v5, v6, v3
-; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v7, v3
-; GFX10-DL-NEXT:    v_mad_u16 v1, v1, v2, v3
-; GFX10-DL-NEXT:    global_store_short v0, v1, s[2:3]
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v2, v3, v4
+; GFX10-DL-NEXT:    global_store_short v1, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_acc16:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_u16 v3, v1, s[0:1]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v2, v0, v3
+; GFX11-DL-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
 entry:
@@ -479,25 +484,14 @@ define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
-; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[2:3]
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
+; GFX9-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v6, v7, v1
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v4, v5, v1
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v8, v9, v1
-; GFX9-DL-NEXT:    global_store_byte v0, v1, s[2:3]
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v0, v2, v3, v4
+; GFX9-DL-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot4_acc8:
@@ -512,21 +506,28 @@ define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
 ; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
 ; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u16 v4, v2, v3, v4
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
-; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v4
-; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
-; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v2, v3, v4
 ; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_acc8:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_u8 v3, v1, s[0:1]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v2, v0, v3
+; GFX11-DL-NEXT:    global_store_b8 v1, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                       ptr addrspace(1) %src2,
                                       ptr addrspace(1) nocapture %dst) {
 entry:
@@ -562,7 +563,6 @@ entry:
   ret void
 }
 
-; TODO: Generate udot4?
 define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot2_8:
 ; GFX7:       ; %bb.0: ; %entry
@@ -644,19 +644,19 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-DL-NEXT:    s_mov_b32 s0, 0xc0c0100
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
-; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
-; GFX9-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[2:3]
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
+; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
+; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v2, v2, v3, v4
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v0, v5, v2
-; GFX9-DL-NEXT:    global_store_byte v1, v0, s[2:3]
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, v3
+; GFX9-DL-NEXT:    global_store_byte v0, v1, s[2:3]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot2_8:
@@ -665,21 +665,44 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_clause 0x1
-; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
-; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-DL-NEXT:    global_load_ubyte v3, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
+; GFX10-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc0c0100
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
+; GFX10-DL-NEXT:    v_perm_b32 v2, v2, v2, 0xc0c0100
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u16 v2, v2, v3, v4
-; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v2
-; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, v3
+; GFX10-DL-NEXT:    global_store_byte v0, v1, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot2_8:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_u8 v3, v2, s[0:1]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc0c0100
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc0c0100
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, v3
+; GFX11-DL-NEXT:    global_store_b8 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                    ptr addrspace(1) %src2,
                                    ptr addrspace(1) nocapture %dst) {
 entry:
@@ -803,25 +826,14 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
-; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[2:3]
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
+; GFX9-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v2, v1, v3
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v7, v6, v1
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v5, v4, v1
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v9, v8, v1
-; GFX9-DL-NEXT:    global_store_byte v0, v1, s[2:3]
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v0, v3, v2, v4
+; GFX9-DL-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot4_CommutationInsideMAD:
@@ -836,21 +848,28 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
 ; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
 ; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u16 v4, v3, v2, v4
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
-; GFX10-DL-NEXT:    v_mad_u16 v0, v5, v0, v4
-; GFX10-DL-NEXT:    v_mad_u16 v0, v7, v6, v0
-; GFX10-DL-NEXT:    v_mad_u16 v0, v3, v2, v0
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v3, v2, v4
 ; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_CommutationInsideMAD:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_u8 v3, v1, s[0:1]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v2, v3
+; GFX11-DL-NEXT:    global_store_b8 v1, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                                       ptr addrspace(1) %src2,
                                                       ptr addrspace(1) nocapture %dst) {
 entry:
@@ -886,7 +905,6 @@ entry:
   ret void
 }
 
-; TODO: Support commutation accross the adds.
 define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot4_CommutationAccrossMADs:
 ; GFX7:       ; %bb.0: ; %entry
@@ -986,25 +1004,14 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
-; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[2:3]
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
+; GFX9-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v7, v6, v3
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v2, v1, v3
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v5, v4, v1
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v9, v8, v1
-; GFX9-DL-NEXT:    global_store_byte v0, v1, s[2:3]
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v0, v3, v2, v4
+; GFX9-DL-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot4_CommutationAccrossMADs:
@@ -1019,21 +1026,28 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
 ; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
 ; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u16 v0, v5, v0, v4
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX10-DL-NEXT:    v_mad_u16 v0, v3, v2, v0
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
-; GFX10-DL-NEXT:    v_mad_u16 v0, v5, v4, v0
-; GFX10-DL-NEXT:    v_mad_u16 v0, v3, v2, v0
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v3, v2, v4
 ; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_CommutationAccrossMADs:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_u8 v3, v1, s[0:1]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v2, v3
+; GFX11-DL-NEXT:    global_store_b8 v1, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                                         ptr addrspace(1) %src2,
                                                         ptr addrspace(1) nocapture %dst) {
 entry:
@@ -1180,14 +1194,9 @@ define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    v_and_b32_e32 v3, 0xff, v1
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DL-NEXT:    v_and_b32_e32 v4, 0xff, v2
-; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
-; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v2, v3, v4
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v3, v3, v4, s0
-; GFX9-DL-NEXT:    v_add3_u32 v2, v5, v3, v2
-; GFX9-DL-NEXT:    v_add3_u32 v1, v2, v6, v1
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, v3
 ; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -1205,17 +1214,36 @@ define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    v_and_b32_e32 v0, 0xff, v1
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-NEXT:    v_and_b32_e32 v3, 0xff, v2
-; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v5, v0, v3
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v0, v0, v3, s2
-; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
-; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-DL-NEXT:    v_add3_u32 v0, v4, v0, v5
-; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v3, v1
-; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v2, v0
+; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_multiuse_mul1:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_and_b32_e32 v2, 0xff, v1
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_and_b32_e32 v3, 0xff, v0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_mad_u32_u24 v2, v2, v3, s2
+; GFX11-DL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, v2
+; GFX11-DL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                                ptr addrspace(1) %src2,
                                                ptr addrspace(1) nocapture %dst) {
 entry:
@@ -1369,18 +1397,12 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_bfe_u32 v4, v1, 8, 8
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_bfe_u32 v5, v2, 8, 8
-; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
-; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, v4, v5, s0
-; GFX9-DL-NEXT:    v_add_u32_e32 v4, s0, v2
-; GFX9-DL-NEXT:    v_add3_u32 v2, v2, v3, v6
-; GFX9-DL-NEXT:    v_add3_u32 v1, v2, v1, v4
+; GFX9-DL-NEXT:    s_add_i32 s1, s0, s0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, s0
+; GFX9-DL-NEXT:    v_add3_u32 v1, s1, v3, v1
 ; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -1394,21 +1416,41 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT:    v_bfe_u32 v0, v1, 8, 8
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_bfe_u32 v3, v2, 8, 8
-; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v0, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u32_u24 v0, v0, v3, s2
-; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
-; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, s2, v0
-; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v4, v3
-; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v1, v2
-; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, s2
+; GFX10-DL-NEXT:    s_add_i32 s2, s2, s2
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-DL-NEXT:    v_add3_u32 v0, s2, v0, v1
+; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_multiuse_add1:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_bfe_u32 v2, v1, 8, 8
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_bfe_u32 v3, v0, 8, 8
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s2
+; GFX11-DL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-DL-NEXT:    s_add_i32 s2, s2, s2
+; GFX11-DL-NEXT:    v_mul_u32_u24_e32 v2, v2, v3
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_add3_u32 v0, s2, v2, v0
+; GFX11-DL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                                ptr addrspace(1) %src2,
                                                ptr addrspace(1) nocapture %dst) {
 entry:
@@ -1564,7 +1606,7 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT:    s_movk_i32 s0, 0xff
+; GFX9-DL-NEXT:    s_mov_b32 s0, 0xc0c0302
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
@@ -1580,13 +1622,10 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    v_bfe_i32 v5, v2, 0, 8
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v6, v7, v3
-; GFX9-DL-NEXT:    v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-DL-NEXT:    v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s0
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v4, v5, v3
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v8, v9, v3
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
+; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s0
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, v3
 ; GFX9-DL-NEXT:    global_store_short v0, v1, s[2:3]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -1596,7 +1635,6 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v7, 0xff
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
@@ -1608,20 +1646,52 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
 ; GFX10-DL-NEXT:    v_bfe_i32 v6, v1, 0, 8
-; GFX10-DL-NEXT:    v_bfe_i32 v8, v2, 0, 8
+; GFX10-DL-NEXT:    v_bfe_i32 v7, v2, 0, 8
+; GFX10-DL-NEXT:    v_perm_b32 v2, v2, v2, 0xc0c0302
 ; GFX10-DL-NEXT:    v_and_b32_e32 v4, 0xff, v4
 ; GFX10-DL-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX10-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc0c0302
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
-; GFX10-DL-NEXT:    v_and_b32_sdwa v4, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-DL-NEXT:    v_and_b32_sdwa v5, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX10-DL-NEXT:    v_mad_u16 v3, v6, v8, v3
-; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
-; GFX10-DL-NEXT:    v_mad_u16 v1, v1, v2, v3
+; GFX10-DL-NEXT:    v_mad_u16 v3, v6, v7, v3
+; GFX10-DL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, v3
 ; GFX10-DL-NEXT:    global_store_short v0, v1, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: notdot4_mixedtypes:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    v_bfe_i32 v6, v1, 0, 8
+; GFX11-DL-NEXT:    v_bfe_i32 v7, v0, 0, 8
+; GFX11-DL-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-DL-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-DL-NEXT:    global_load_u16 v3, v2, s[0:1]
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc0c0302
+; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc0c0302
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_mad_u16 v3, v6, v7, v3
+; GFX11-DL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, v3
+; GFX11-DL-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                               ptr addrspace(1) %src2,
                                               ptr addrspace(1) nocapture %dst) {
 entry:
@@ -1665,7 +1735,7 @@ entry:
   ret void
 }
 
-; TODO: cleanup s_lshr_b32 and support this pattern.
+; TODO: cleanup s_lshr_b32
 define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot4_acc32_vecMul:
 ; GFX7:       ; %bb.0: ; %entry
@@ -1767,14 +1837,8 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
-; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_add3_u32 v2, v3, s0, v4
-; GFX9-DL-NEXT:    v_add3_u32 v1, v2, v5, v1
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, s0
 ; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -1787,22 +1851,30 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0xffff
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT:    v_and_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v0, v3, v0
-; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
-; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_add3_u32 v0, v4, s2, v0
-; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v3, v1
-; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, s2
+; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_acc32_vecMul:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s2
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                               ptr addrspace(1) %src2,
                                               ptr addrspace(1) nocapture %dst) {
 entry:
@@ -2016,6 +2088,51 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v3
 ; GFX10-DL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_acc16_vecMul:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v7, 0xff, v1
+; GFX11-DL-NEXT:    global_load_u16 v3, v2, s[0:1]
+; GFX11-DL-NEXT:    v_lshrrev_b16 v4, 8, v1
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_lshrrev_b16 v5, 8, v0
+; GFX11-DL-NEXT:    v_and_b32_e32 v6, 0xff, v0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
+; GFX11-DL-NEXT:    v_perm_b32 v4, v4, v7, 0x5040100
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
+; GFX11-DL-NEXT:    v_perm_b32 v5, v5, v6, 0x5040100
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX11-DL-NEXT:    v_and_b32_e32 v6, 0xff, v9
+; GFX11-DL-NEXT:    v_and_b32_e32 v7, 0xff, v8
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v6, 0x5040100
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v7, 0x5040100
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_pk_mul_lo_u16 v0, v1, v0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_add_nc_u16 v3, v4, v3
+; GFX11-DL-NEXT:    v_add_nc_u16 v1, v3, v5
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-DL-NEXT:    v_add_nc_u16 v0, v1, v0
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_add_nc_u16 v0, v0, v3
+; GFX11-DL-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                               ptr addrspace(1) %src2,
                                               ptr addrspace(1) nocapture %dst) {
 entry:
@@ -2208,6 +2325,53 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v2
 ; GFX10-DL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_acc8_vecMul:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_u8 v3, v2, s[0:1]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX11-DL-NEXT:    v_lshrrev_b16 v8, 8, v1
+; GFX11-DL-NEXT:    v_lshrrev_b16 v9, 8, v0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_mad_u16 v0, v1, v0, v3
+; GFX11-DL-NEXT:    v_mul_lo_u16 v5, v5, v6
+; GFX11-DL-NEXT:    v_mul_lo_u16 v6, v4, v7
+; GFX11-DL-NEXT:    v_mul_lo_u16 v8, v8, v9
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_lshlrev_b16 v5, 8, v5
+; GFX11-DL-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_lshlrev_b16 v8, 8, v8
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v1, 8, v5
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_or_b32_e32 v6, v6, v5
+; GFX11-DL-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-DL-NEXT:    v_or_b32_e32 v6, v8, v6
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v6
+; GFX11-DL-NEXT:    v_add_nc_u16 v0, v0, v6
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_mad_u16 v0, v4, v7, v0
+; GFX11-DL-NEXT:    v_add_nc_u16 v0, v0, v1
+; GFX11-DL-NEXT:    global_store_b8 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                              ptr addrspace(1) %src2,
                                              ptr addrspace(1) nocapture %dst) {
 entry:
@@ -2233,4 +2397,2229 @@ entry:
   ret void
 }
 
+define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
+; GFX7-LABEL: idot4_acc32_2ele:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX7-NEXT:    v_bfe_u32 v2, v2, 8, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v0
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 8, 8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v3, s4
+; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: idot4_acc32_2ele:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v3
+; GFX8-NEXT:    v_bfe_u32 v3, v3, 8, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v0
+; GFX8-NEXT:    v_bfe_u32 v0, v0, 8, 8
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v2, s2
+; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_acc32_2ele:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v3, s0, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: idot4_acc32_2ele:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc0c0100
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s1
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s1
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v2, v1, s0
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_acc32_2ele:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x1
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v1, 0xc0c0100
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v1, v2, v2, 0xc0c0100
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s2
+; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32_2ele:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[4:5]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc0c0100
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc0c0100
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, s2
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) nocapture %dst) {
+entry: ; The -DL check blocks above expect v_perm_b32 with selector 0xc0c0100 on both inputs, then v_dot4_u32_u8.
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+  ; Partial unsigned dot product: only elements 0 and 1 of the <4 x i8> inputs are read.
+  %v1e0 = extractelement <4 x i8> %vec1, i64 0
+  %cv1e0 = zext i8 %v1e0 to i32
+  %v2e0 = extractelement <4 x i8> %vec2, i64 0
+  %cv2e0 = zext i8 %v2e0 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
+  ; Second term: element 1 of each vector, zero-extended to i32 before the multiply.
+  %v1e1 = extractelement <4 x i8> %vec1, i64 1
+  %cv1e1 = zext i8 %v1e1 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 1
+  %cv2e1 = zext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+  ; Add both products to the i32 loaded from %dst and store the total back to %dst.
+  %acc = load i32, ptr addrspace(1) %dst, align 4
+  %add1 = add i32 %mul1, %acc
+  %add2 = add i32 %add1, %mul2
+  store i32 %add2, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
+; GFX7-LABEL: idot4_acc32_3ele:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX7-NEXT:    v_bfe_u32 v3, v2, 8, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v0
+; GFX7-NEXT:    v_bfe_u32 v5, v0, 8, 8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v4, s4
+; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
+; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v5, v1
+; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: idot4_acc32_3ele:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v3
+; GFX8-NEXT:    v_bfe_u32 v4, v3, 8, 8
+; GFX8-NEXT:    v_bfe_u32 v3, v3, 16, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v0
+; GFX8-NEXT:    v_bfe_u32 v5, v0, 8, 8
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v2, s2
+; GFX8-NEXT:    v_bfe_u32 v0, v0, 16, 8
+; GFX8-NEXT:    v_mad_u32_u24 v1, v4, v5, v1
+; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_acc32_3ele:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_and_b32_e32 v3, 0xff, v1
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xff, v2
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, v3, v4, s0
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v5, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: idot4_acc32_3ele:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc020100
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s1
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s1
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v2, v1, s0
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_acc32_3ele:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x1
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v1, 0xc020100
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v1, v2, v2, 0xc020100
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s2
+; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32_3ele:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[4:5]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc020100
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc020100
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, s2
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) nocapture %dst) {
+entry: ; The -DL check blocks above expect v_perm_b32 with selector 0xc020100 on both inputs, then v_dot4_u32_u8.
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+  ; Partial unsigned dot product: elements 0, 1 and 2 of the <4 x i8> inputs are read; element 3 is not.
+  %v1e0 = extractelement <4 x i8> %vec1, i64 0
+  %cv1e0 = zext i8 %v1e0 to i32
+  %v2e0 = extractelement <4 x i8> %vec2, i64 0
+  %cv2e0 = zext i8 %v2e0 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
+  ; Second term: element 1 of each vector, zero-extended to i32 before the multiply.
+  %v1e1 = extractelement <4 x i8> %vec1, i64 1
+  %cv1e1 = zext i8 %v1e1 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 1
+  %cv2e1 = zext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+  ; Third term: element 2 of each vector.
+  %v1e2 = extractelement <4 x i8> %vec1, i64 2
+  %cv1e2 = zext i8 %v1e2 to i32
+  %v2e2 = extractelement <4 x i8> %vec2, i64 2
+  %cv2e2 = zext i8 %v2e2 to i32
+  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
+  ; Add the three products to the i32 loaded from %dst and store the total back to %dst.
+  %acc = load i32, ptr addrspace(1) %dst, align 4
+  %add1 = add i32 %mul1, %acc
+  %add2 = add i32 %add1, %mul2
+  %add3 = add i32 %add2, %mul3
+  store i32 %add3, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
+; GFX7-LABEL: idot4_acc32_3ele_permuted:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xff, v0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v4, s4
+; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
+; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v5, v1
+; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: idot4_acc32_3ele_permuted:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 24, v3
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xff, v3
+; GFX8-NEXT:    v_bfe_u32 v3, v3, 16, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xff, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v2, s2
+; GFX8-NEXT:    v_bfe_u32 v0, v0, 16, 8
+; GFX8-NEXT:    v_mad_u32_u24 v1, v4, v5, v1
+; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_acc32_3ele_permuted:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, v3, v4, s0
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v5, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: idot4_acc32_3ele_permuted:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc020003
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s1
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s1
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v2, v1, s0
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_acc32_3ele_permuted:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x1
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v1, 0xc020003
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v1, v2, v2, 0xc020003
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s2
+; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32_3ele_permuted:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[4:5]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc020003
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc020003
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, s2
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) nocapture %dst) {
+entry: ; The -DL check blocks above expect v_perm_b32 with selector 0xc020003 on both inputs, then v_dot4_u32_u8.
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+  ; Elements are consumed in the order 3, 0, 2 (element 1 is never read); the e0/e1/e2 name suffixes follow order of use, not the lane index.
+  %v1e0 = extractelement <4 x i8> %vec1, i64 3
+  %cv1e0 = zext i8 %v1e0 to i32
+  %v2e0 = extractelement <4 x i8> %vec2, i64 3
+  %cv2e0 = zext i8 %v2e0 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
+  ; Second term: element 0 of each vector, zero-extended to i32 before the multiply.
+  %v1e1 = extractelement <4 x i8> %vec1, i64 0
+  %cv1e1 = zext i8 %v1e1 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 0
+  %cv2e1 = zext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+  ; Third term: element 2 of each vector.
+  %v1e2 = extractelement <4 x i8> %vec1, i64 2
+  %cv1e2 = zext i8 %v1e2 to i32
+  %v2e2 = extractelement <4 x i8> %vec2, i64 2
+  %cv2e2 = zext i8 %v2e2 to i32
+  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
+  ; Add the three products to the i32 loaded from %dst and store the total back to %dst.
+  %acc = load i32, ptr addrspace(1) %dst, align 4
+  %add1 = add i32 %mul1, %acc
+  %add2 = add i32 %add1, %mul2
+  %add3 = add i32 %add2, %mul3
+  store i32 %add3, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+
+define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
+; GFX7-LABEL: idot4_acc32_opt:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_bfe_u32 v3, v2, 8, 8
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xff, v0
+; GFX7-NEXT:    v_mul_u32_u24_e32 v3, v3, v6
+; GFX7-NEXT:    v_bfe_u32 v4, v2, 16, 8
+; GFX7-NEXT:    v_bfe_u32 v7, v0, 16, 8
+; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v5, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
+; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: idot4_acc32_opt:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v2, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xff, v3
+; GFX8-NEXT:    v_bfe_u32 v7, v3, 16, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xff, v2
+; GFX8-NEXT:    v_mul_u32_u24_sdwa v6, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX8-NEXT:    v_bfe_u32 v8, v2, 16, 8
+; GFX8-NEXT:    v_mad_u32_u24 v4, v4, v5, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; GFX8-NEXT:    v_mad_u32_u24 v4, v7, v8, v4
+; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v2, v4
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_acc32_opt:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_and_b32_e32 v3, 0xff, v1
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xff, v2
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, v3, v4, v5
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v6, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: idot4_acc32_opt:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, 0
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_acc32_opt:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x1
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, 0
+; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32_opt:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, 0
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) nocapture %dst) {
+entry: ; The -DL check blocks above expect v_dot4_u32_u8 directly on the loaded dwords with a literal 0 third operand and no v_perm_b32.
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+  ; Full four-element unsigned dot product; unlike the sibling tests, no prior value is loaded from %dst.
+  %v1e0 = extractelement <4 x i8> %vec1, i64 0
+  %cv1e0 = zext i8 %v1e0 to i32
+  %v2e0 = extractelement <4 x i8> %vec2, i64 0
+  %cv2e0 = zext i8 %v2e0 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
+  ; Second term: element 1 of each vector, zero-extended to i32 before the multiply.
+  %v1e1 = extractelement <4 x i8> %vec1, i64 1
+  %cv1e1 = zext i8 %v1e1 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 1
+  %cv2e1 = zext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+  ; Third term: element 2 of each vector.
+  %v1e2 = extractelement <4 x i8> %vec1, i64 2
+  %cv1e2 = zext i8 %v1e2 to i32
+  %v2e2 = extractelement <4 x i8> %vec2, i64 2
+  %cv2e2 = zext i8 %v2e2 to i32
+  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
+  ; Fourth term: element 3 of each vector.
+  %v1e3 = extractelement <4 x i8> %vec1, i64 3
+  %cv1e3 = zext i8 %v1e3 to i32
+  %v2e3 = extractelement <4 x i8> %vec2, i64 3
+  %cv2e3 = zext i8 %v2e3 to i32
+  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
+  ; Sum the four products (the chain starts at %mul1, not at an accumulator) and store the result to %dst.
+  %add2 = add i32 %mul1, %mul2
+  %add3 = add i32 %add2, %mul3
+  %add4 = add i32 %add3, %mul4
+  store i32 %add4, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1, ; sums four zext(u8) products drawn from three <4 x i8> sources into the i32 at %dst
+; GFX7-LABEL: udot4_acc32_3src:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; GFX7-NEXT:    s_mov_b32 s11, 0xf000
+; GFX7-NEXT:    s_mov_b32 s14, 0
+; GFX7-NEXT:    s_mov_b32 s15, s11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX7-NEXT:    s_mov_b32 s10, -1
+; GFX7-NEXT:    s_mov_b32 s8, s6
+; GFX7-NEXT:    s_mov_b32 s9, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_bfe_u32 v3, v3, 8, 8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v1, s0
+; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_bfe_u32 v6, v0, 16, 8
+; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v3, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v6, v1
+; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: udot4_acc32_3src:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v4, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v3
+; GFX8-NEXT:    v_bfe_u32 v2, v3, 8, 8
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v1, s0
+; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_bfe_u32 v4, v4, 8, 8
+; GFX8-NEXT:    v_mad_u32_u24 v1, v2, v4, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_bfe_u32 v6, v0, 16, 8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX8-NEXT:    v_mad_u32_u24 v1, v5, v6, v1
+; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: udot4_acc32_3src:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
+; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[4:5]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v4, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_add3_u32 v2, v4, s0, v2
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v5, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: udot4_acc32_3src:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[0:1]
+; GFX9-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_mov_b32 s0, 0x706010c
+; GFX9-DL-NEXT:    s_mov_b32 s2, 0xc0c0c00
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_perm_b32 v1, v2, v1, s0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s2
+; GFX9-DL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v3, v1, s1
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot4_acc32_3src:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x2
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_perm_b32 v0, v2, v1, 0x706010c
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v1, v3, v3, 0xc0c0c00
+; GFX10-DL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v3, v0, s0
+; GFX10-DL-NEXT:    global_store_dword v1, v0, s[6:7]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_acc32_3src:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x2
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[0:1]
+; GFX11-DL-NEXT:    s_load_b32 s0, s[6:7], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_perm_b32 v1, v2, v1, 0x706010c
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v2, v0, v0, 0xc0c0c00
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, s0
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[6:7]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) %src3,
+                                       ptr addrspace(1) nocapture %dst) {
+entry: ; the checks above expect v_dot4_u32_u8 only under the *-DL prefixes; the other prefixes expect per-byte multiply/add sequences
+  %idx = call i32 @llvm.amdgcn.workitem.id.x() ; the work-item id indexes the <4 x i8> read from each source
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+  %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx
+  %vec3 = load <4 x i8>, ptr addrspace(1) %gep3
+
+  %v1e0 = extractelement <4 x i8> %vec1, i64 0
+  %cv1e0 = zext i8 %v1e0 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv1e0 ; vec1[0] * vec1[0]
+
+  %v1e1 = extractelement <4 x i8> %vec1, i64 1
+  %cv1e1 = zext i8 %v1e1 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 1
+  %cv2e1 = zext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 ; vec1[1] * vec2[1]
+
+  %v1e2 = extractelement <4 x i8> %vec1, i64 2
+  %cv1e2 = zext i8 %v1e2 to i32
+  %v3e2 = extractelement <4 x i8> %vec3, i64 2
+  %cv3e2 = zext i8 %v3e2 to i32
+  %mul3 = mul nuw nsw i32 %cv1e2, %cv3e2 ; vec1[2] * vec3[2]
+
+  %v1e3 = extractelement <4 x i8> %vec1, i64 3
+  %cv1e3 = zext i8 %v1e3 to i32
+  %v3e3 = extractelement <4 x i8> %vec3, i64 3
+  %cv3e3 = zext i8 %v3e3 to i32
+  %mul4 = mul nuw nsw i32 %cv1e3, %cv3e3 ; vec1[3] * vec3[3]
+
+  %acc = load i32, ptr addrspace(1) %dst, align 4 ; the add chain starts from the value already stored at %dst
+  %mad1 = add i32 %mul1, %acc
+  %mad2 = add i32 %mad1, %mul2
+  %mad3 = add i32 %mad2, %mul3
+  %mad4 = add i32 %mad3, %mul4
+
+  store i32 %mad4, ptr addrspace(1) %dst, align 4 ; the sum is written back to the same location
+  ret void
+}
+
+define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; sums three zext(u8) products (elements 0-2) drawn from three <4 x i8> sources into the i32 at %dst
+; GFX7-LABEL: udot4_acc32_3src_3ele:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; GFX7-NEXT:    s_mov_b32 s11, 0xf000
+; GFX7-NEXT:    s_mov_b32 s14, 0
+; GFX7-NEXT:    s_mov_b32 s15, s11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX7-NEXT:    s_mov_b32 s10, -1
+; GFX7-NEXT:    s_mov_b32 s8, s6
+; GFX7-NEXT:    s_mov_b32 s9, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_bfe_u32 v3, v3, 8, 8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v1, s0
+; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
+; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v3, v1
+; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: udot4_acc32_3src_3ele:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v4, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v3
+; GFX8-NEXT:    v_bfe_u32 v2, v3, 8, 8
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v1, s0
+; GFX8-NEXT:    v_bfe_u32 v3, v3, 16, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_bfe_u32 v4, v4, 8, 8
+; GFX8-NEXT:    v_mad_u32_u24 v1, v2, v4, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_bfe_u32 v0, v0, 16, 8
+; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: udot4_acc32_3src_3ele:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
+; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[4:5]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xff, v1
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v3, v4, v4, s0
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v3, v2, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: udot4_acc32_3src_3ele:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[0:1]
+; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_mov_b32 s0, 0xc06010c
+; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc0c0c00
+; GFX9-DL-NEXT:    s_mov_b32 s2, 0xc020100
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_perm_b32 v1, v2, v1, s0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s1
+; GFX9-DL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s2
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v2, v1, s3
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot4_acc32_3src_3ele:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x2
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_perm_b32 v0, v2, v1, 0xc06010c
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v1, v3, v3, 0xc0c0c00
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-DL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-DL-NEXT:    v_perm_b32 v1, v3, v3, 0xc020100
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s0
+; GFX10-DL-NEXT:    global_store_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_acc32_3src_3ele:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x2
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[0:1]
+; GFX11-DL-NEXT:    s_load_b32 s0, s[6:7], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_perm_b32 v1, v2, v1, 0xc06010c
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v2, v0, v0, 0xc0c0c00
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc020100
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, s0
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[6:7]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) %src3,
+                                       ptr addrspace(1) nocapture %dst) {
+entry: ; the checks above expect one v_dot4_u32_u8 under the *-DL prefixes, with v_perm_b32 preparing both of its vector operands
+  %idx = call i32 @llvm.amdgcn.workitem.id.x() ; the work-item id indexes the <4 x i8> read from each source
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+  %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx
+  %vec3 = load <4 x i8>, ptr addrspace(1) %gep3
+
+  %v1e0 = extractelement <4 x i8> %vec1, i64 0
+  %cv1e0 = zext i8 %v1e0 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv1e0 ; vec1[0] * vec1[0]
+
+  %v1e1 = extractelement <4 x i8> %vec1, i64 1
+  %cv1e1 = zext i8 %v1e1 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 1
+  %cv2e1 = zext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 ; vec1[1] * vec2[1]
+
+  %v1e2 = extractelement <4 x i8> %vec1, i64 2
+  %cv1e2 = zext i8 %v1e2 to i32
+  %v3e2 = extractelement <4 x i8> %vec3, i64 2
+  %cv3e2 = zext i8 %v3e2 to i32
+  %mul3 = mul nuw nsw i32 %cv1e2, %cv3e2 ; vec1[2] * vec3[2]; element 3 of every vector is left unused
+
+
+  %acc = load i32, ptr addrspace(1) %dst, align 4 ; the add chain starts from the value already stored at %dst
+  %mad1 = add i32 %mul1, %acc
+  %mad2 = add i32 %mad1, %mul2
+  %mad3 = add i32 %mad2, %mul3
+
+  store i32 %mad3, ptr addrspace(1) %dst, align 4 ; the sum is written back to the same location
+  ret void
+}
+
+define amdgpu_kernel void @udot4_bad_source(ptr addrspace(1) %src1, ; three products summed into the i32 at %dst, where one multiplicand is the zero-extended i16 argument instead of a vector byte
+; GFX7-LABEL: udot4_bad_source:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dword s12, s[0:1], 0xf
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x11
+; GFX7-NEXT:    s_and_b32 s5, s12, 0xffff
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v2
+; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_bfe_u32 v5, v0, 8, 8
+; GFX7-NEXT:    v_mad_u32_u24 v1, v3, s5, v1
+; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
+; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v5, v1
+; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: udot4_bad_source:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x3c
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX8-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v3
+; GFX8-NEXT:    v_bfe_u32 v4, v3, 8, 8
+; GFX8-NEXT:    v_mad_u32_u24 v1, v2, s2, v1
+; GFX8-NEXT:    v_bfe_u32 v3, v3, 16, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_bfe_u32 v5, v0, 8, 8
+; GFX8-NEXT:    v_bfe_u32 v0, v0, 16, 8
+; GFX8-NEXT:    v_mad_u32_u24 v1, v4, v5, v1
+; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: udot4_bad_source:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    s_load_dword s2, s[0:1], 0x3c
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_and_b32_e32 v3, 0xff, v1
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, v3, s2, v2
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v4, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: udot4_bad_source:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_load_dword s2, s[0:1], 0x3c
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-DL-NEXT:    s_mov_b32 s4, 0xc0c0201
+; GFX9-DL-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_and_b32_e32 v4, 0xff, v1
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s4
+; GFX9-DL-NEXT:    v_mad_u32_u24 v3, v4, s2, v3
+; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s4
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, v3
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot4_bad_source:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_clause 0x1
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x3c
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x1
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX10-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_and_b32_e32 v0, 0xff, v1
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v2, v2, v2, 0xc0c0201
+; GFX10-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc0c0201
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, v0, s2, s3
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v2, v0
+; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_bad_source:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x3c
+; GFX11-DL-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x44
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX11-DL-NEXT:    s_load_b32 s3, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_and_b32_e32 v2, 0xff, v1
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc0c0201
+; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc0c0201
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    v_mad_u32_u24 v2, v2, s2, s3
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, v2
+; GFX11-DL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) %src3,
+                                       i16 %badsource,
+                                       ptr addrspace(1) nocapture %dst) {
+entry: ; the checks above expect, under the *-DL prefixes, a v_mad_u32_u24 for the i16 product whose result is the accumulator operand of v_dot4_u32_u8
+  %idx = call i32 @llvm.amdgcn.workitem.id.x() ; the work-item id indexes the <4 x i8> read from each source
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+  %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx
+  %vec3 = load <4 x i8>, ptr addrspace(1) %gep3 ; NOTE(review): %vec3 has no uses in this function
+
+  %v1e0 = extractelement <4 x i8> %vec1, i64 0
+  %cv1e0 = zext i8 %v1e0 to i32
+  %v2e0 = extractelement <4 x i8> %vec2, i64 0 ; NOTE(review): %v2e0 has no uses; %other takes its place in the multiply below
+  %other = zext i16 %badsource to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %other ; vec1[0] * zext(%badsource)
+
+  %v1e1 = extractelement <4 x i8> %vec1, i64 1
+  %cv1e1 = zext i8 %v1e1 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 1
+  %cv2e1 = zext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 ; vec1[1] * vec2[1]
+
+  %v2e2 = extractelement <4 x i8> %vec2, i64 2
+  %cv2e2 = zext i8 %v2e2 to i32
+  %v1e2 = extractelement <4 x i8> %vec1, i64 2
+  %cv1e2 = zext i8 %v1e2 to i32
+  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2 ; vec1[2] * vec2[2]
+
+
+  %acc = load i32, ptr addrspace(1) %dst, align 4 ; the add chain starts from the value already stored at %dst
+  %mad1 = add i32 %mul1, %acc
+  %mad2 = add i32 %mad1, %mul2
+  %mad3 = add i32 %mad2, %mul3
+
+  store i32 %mad3, ptr addrspace(1) %dst, align 4 ; the sum is written back to the same location
+  ret void
+}
+
+
+define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1, ; sums three zext(u8) products vec1[i] * vec2[i] (i = 0..2) into the i32 at %dst
+; GFX7-LABEL: udot4_commutative:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xf
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX7-NEXT:    v_bfe_u32 v3, v2, 8, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v0
+; GFX7-NEXT:    v_bfe_u32 v5, v0, 8, 8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v4, s4
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
+; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
+; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v5, v1
+; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: udot4_commutative:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v3
+; GFX8-NEXT:    v_bfe_u32 v4, v3, 8, 8
+; GFX8-NEXT:    v_bfe_u32 v3, v3, 16, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v0
+; GFX8-NEXT:    v_bfe_u32 v5, v0, 8, 8
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v2, s2
+; GFX8-NEXT:    v_bfe_u32 v0, v0, 16, 8
+; GFX8-NEXT:    v_mad_u32_u24 v1, v4, v5, v1
+; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: udot4_commutative:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_and_b32_e32 v3, 0xff, v1
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xff, v2
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, v3, v4, s0
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v5, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: udot4_commutative:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc020100
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s1
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s1
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v2, v1, s0
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot4_commutative:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x1
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v1, 0xc020100
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v1, v2, v2, 0xc020100
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s2
+; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_commutative:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x3c
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[4:5]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc020100
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc020100
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, s2
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) %src3,
+                                       ptr addrspace(1) nocapture %dst) {
+entry: ; the checks above expect one v_dot4_u32_u8 under the *-DL prefixes, with both vector operands passed through v_perm_b32
+  %idx = call i32 @llvm.amdgcn.workitem.id.x() ; the work-item id indexes the <4 x i8> read from each source
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+  %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx
+  %vec3 = load <4 x i8>, ptr addrspace(1) %gep3 ; NOTE(review): %vec3 has no uses in this function
+
+  %v1e0 = extractelement <4 x i8> %vec1, i64 0
+  %cv1e0 = zext i8 %v1e0 to i32
+  %v2e0 = extractelement <4 x i8> %vec2, i64 0
+  %cv2e0 = zext i8 %v2e0 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0 ; vec1[0] * vec2[0]
+
+  %v1e1 = extractelement <4 x i8> %vec1, i64 1
+  %cv1e1 = zext i8 %v1e1 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 1
+  %cv2e1 = zext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 ; vec1[1] * vec2[1]
+
+  %v2e2 = extractelement <4 x i8> %vec2, i64 2 ; element 2 is extracted from %vec2 before %vec1, the reverse of elements 0 and 1
+  %cv2e2 = zext i8 %v2e2 to i32
+  %v1e2 = extractelement <4 x i8> %vec1, i64 2
+  %cv1e2 = zext i8 %v1e2 to i32
+  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2 ; vec1[2] * vec2[2]
+
+
+  %acc = load i32, ptr addrspace(1) %dst, align 4 ; the add chain starts from the value already stored at %dst
+  %mad1 = add i32 %mul1, %acc
+  %mad2 = add i32 %mad1, %mul2
+  %mad3 = add i32 %mad2, %mul3
+
+  store i32 %mad3, ptr addrspace(1) %dst, align 4 ; the sum is written back to the same location
+  ret void
+}
+
+define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
+; GFX7-LABEL: udot4_acc32_3src_3ele_src0:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; GFX7-NEXT:    s_mov_b32 s11, 0xf000
+; GFX7-NEXT:    s_mov_b32 s14, 0
+; GFX7-NEXT:    s_mov_b32 s15, s11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX7-NEXT:    s_mov_b32 s10, -1
+; GFX7-NEXT:    s_mov_b32 s8, s6
+; GFX7-NEXT:    s_mov_b32 s9, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-NEXT:    v_bfe_u32 v1, v2, 8, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_bfe_u32 v2, v3, 8, 8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mad_u32_u24 v4, v2, v2, s0
+; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
+; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v2, v4
+; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: udot4_acc32_3src_3ele_src0:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v4, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-NEXT:    v_bfe_u32 v2, v3, 8, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_bfe_u32 v1, v4, 8, 8
+; GFX8-NEXT:    v_bfe_u32 v3, v4, 16, 8
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mad_u32_u24 v4, v1, v1, s0
+; GFX8-NEXT:    v_mad_u32_u24 v1, v2, v1, v4
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_bfe_u32 v0, v0, 16, 8
+; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: udot4_acc32_3src_3ele_src0:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[0:1]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NODL-NEXT:    v_bfe_u32 v4, v1, 8, 8
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v3, v4, v4, s0
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v3, v2, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: udot4_acc32_3src_3ele_src0:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[0:1]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[2:3]
+; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_mov_b32 s0, 0xc06010c
+; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc0c0c01
+; GFX9-DL-NEXT:    s_mov_b32 s2, 0xc020101
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v2, s0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s1
+; GFX9-DL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s2
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v2, v1, s3
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot4_acc32_3src_3ele_src0:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x2
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT:    global_load_dword v3, v0, s[2:3]
+; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v2, 0xc06010c
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v1, v3, v3, 0xc0c0c01
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-DL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-DL-NEXT:    v_perm_b32 v1, v3, v3, 0xc020101
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s0
+; GFX10-DL-NEXT:    global_store_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_acc32_3src_3ele_src0:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x2
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[2:3]
+; GFX11-DL-NEXT:    s_load_b32 s0, s[6:7], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v2, 0xc06010c
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v2, v0, v0, 0xc0c0c01
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc020101
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, s0
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[6:7]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) %src3,
+                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+  %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx
+  %vec3 = load <4 x i8>, ptr addrspace(1) %gep3
+
+  %v2e0 = extractelement <4 x i8> %vec2, i64 1
+  %cv2e0 = zext i8 %v2e0 to i32
+  %mul1 = mul nuw nsw i32 %cv2e0, %cv2e0
+
+  %v1e1 = extractelement <4 x i8> %vec1, i64 1
+  %cv1e1 = zext i8 %v1e1 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 1
+  %cv2e1 = zext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+
+  %v3e2 = extractelement <4 x i8> %vec3, i64 2
+  %cv3e2 = zext i8 %v3e2 to i32
+  %v2e2 = extractelement <4 x i8> %vec2, i64 2
+  %cv2e2 = zext i8 %v2e2 to i32
+  %mul3 = mul nuw nsw i32 %cv2e2, %cv3e2
+
+
+  %acc = load i32, ptr addrspace(1) %dst, align 4
+  %mad1 = add i32 %mul1, %acc
+  %mad2 = add i32 %mad1, %mul2
+  %mad3 = add i32 %mad2, %mul3
+
+  store i32 %mad3, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1,
+; GFX7-LABEL: udot4_4src:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x9
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s14, 0
+; GFX7-NEXT:    s_mov_b32 s15, s3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[8:9]
+; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x11
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX7-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX7-NEXT:    v_bfe_u32 v2, v2, 8, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xff, v3
+; GFX7-NEXT:    v_bfe_u32 v3, v3, 8, 8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX7-NEXT:    v_bfe_u32 v4, v4, 8, 8
+; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v3, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v0
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 8, 8
+; GFX7-NEXT:    v_mad_u32_u24 v1, v2, v4, v1
+; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: udot4_4src:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v4, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s9
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s8, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v5, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s11
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s10, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v3
+; GFX8-NEXT:    v_bfe_u32 v2, v3, 8, 8
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v2, s2
+; GFX8-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX8-NEXT:    v_bfe_u32 v4, v4, 8, 8
+; GFX8-NEXT:    v_mad_u32_u24 v1, v3, v4, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xff, v5
+; GFX8-NEXT:    v_bfe_u32 v5, v5, 8, 8
+; GFX8-NEXT:    v_mad_u32_u24 v1, v6, v5, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xff, v0
+; GFX8-NEXT:    v_bfe_u32 v0, v0, 8, 8
+; GFX8-NEXT:    v_mad_u32_u24 v2, v7, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: udot4_4src:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[8:9]
+; GFX9-NODL-NEXT:    global_load_dword v4, v0, s[10:11]
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, s2, v2
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, v3, v4
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: udot4_4src:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX9-DL-NEXT:    s_mov_b32 s2, 0xc0c0501
+; GFX9-DL-NEXT:    s_mov_b32 s3, 0x5010c0c
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[8:9]
+; GFX9-DL-NEXT:    global_load_dword v4, v0, s[10:11]
+; GFX9-DL-NEXT:    s_mov_b32 s4, 0xc0c0400
+; GFX9-DL-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_mov_b32 s5, 0x4000c0c
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-DL-NEXT:    v_perm_b32 v5, v2, v1, s2
+; GFX9-DL-NEXT:    v_perm_b32 v1, v2, v1, s4
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v6, v4, v3, s3
+; GFX9-DL-NEXT:    v_perm_b32 v2, v4, v3, s5
+; GFX9-DL-NEXT:    v_or_b32_e32 v3, v6, v5
+; GFX9-DL-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v1, v3, s6
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot4_4src:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x3
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT:    global_load_dword v3, v0, s[8:9]
+; GFX10-DL-NEXT:    global_load_dword v4, v0, s[10:11]
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-DL-NEXT:    v_perm_b32 v0, v2, v1, 0xc0c0501
+; GFX10-DL-NEXT:    v_perm_b32 v1, v2, v1, 0xc0c0400
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v5, v4, v3, 0x5010c0c
+; GFX10-DL-NEXT:    v_perm_b32 v2, v4, v3, 0x4000c0c
+; GFX10-DL-NEXT:    v_or_b32_e32 v0, v5, v0
+; GFX10-DL-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s2
+; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_4src:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x44
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x3
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_b32 v3, v0, s[8:9]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[10:11]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-DL-NEXT:    v_perm_b32 v4, v2, v1, 0xc0c0501
+; GFX11-DL-NEXT:    v_perm_b32 v1, v2, v1, 0xc0c0400
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v5, v0, v3, 0x5010c0c
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v3, 0x4000c0c
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_or_b32_e32 v2, v5, v4
+; GFX11-DL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-DL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v2, s2
+; GFX11-DL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) %src3,
+                                       ptr addrspace(1) %src4,
+                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+  %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx
+  %vec3 = load <4 x i8>, ptr addrspace(1) %gep3
+  %gep4 = getelementptr <4 x i8>, ptr addrspace(1) %src4, i32 %idx
+  %vec4 = load <4 x i8>, ptr addrspace(1) %gep4
+
+
+  %v1e0 = extractelement <4 x i8> %vec1, i64 0
+  %cv1e0 = zext i8 %v1e0 to i32
+  %v1e1 = extractelement <4 x i8> %vec1, i64 1
+  %cv1e1 = zext i8 %v1e1 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv1e1
+
+  %v2e0 = extractelement <4 x i8> %vec2, i64 0
+  %cv2e0 = zext i8 %v2e0 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 1
+  %cv2e1 = zext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv2e0, %cv2e1
+
+  %v3e0 = extractelement <4 x i8> %vec3, i64 0
+  %cv3e0 = zext i8 %v3e0 to i32
+  %v3e1 = extractelement <4 x i8> %vec3, i64 1
+  %cv3e1 = zext i8 %v3e1 to i32
+  %mul3 = mul nuw nsw i32 %cv3e0, %cv3e1
+
+  %v4e0 = extractelement <4 x i8> %vec4, i64 0
+  %cv4e0 = zext i8 %v4e0 to i32
+  %v4e1 = extractelement <4 x i8> %vec4, i64 1
+  %cv4e1 = zext i8 %v4e1 to i32
+  %mul4 = mul nuw nsw i32 %cv4e0, %cv4e1
+
+
+  %acc = load i32, ptr addrspace(1) %dst, align 4
+  %mad1 = add i32 %mul1, %acc
+  %mad2 = add i32 %mad1, %mul2
+  %mad3 = add i32 %mad2, %mul3
+  %mad4 = add i32 %mad3, %mul4
+
+  store i32 %mad4, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1,
+; GFX7-LABEL: udot4_acc32_multi:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[10:11]
+; GFX7-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v0
+; GFX7-NEXT:    v_bfe_u32 v7, v2, 16, 8
+; GFX7-NEXT:    v_bfe_u32 v8, v0, 16, 8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v4, s4
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xff, v3
+; GFX7-NEXT:    v_mad_u32_u24 v1, v7, v8, v1
+; GFX7-NEXT:    v_bfe_u32 v11, v3, 16, 8
+; GFX7-NEXT:    v_mad_u32_u24 v1, v9, v4, v1
+; GFX7-NEXT:    v_bfe_u32 v5, v2, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
+; GFX7-NEXT:    v_mad_u32_u24 v1, v11, v8, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v6, v1
+; GFX7-NEXT:    v_bfe_u32 v10, v3, 8, 8
+; GFX7-NEXT:    v_mad_u32_u24 v1, v2, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
+; GFX7-NEXT:    v_mad_u32_u24 v1, v10, v6, v1
+; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: udot4_acc32_multi:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v3, s7
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s6, v2
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    flat_load_dword v2, v[2:3]
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xff, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xff, v2
+; GFX8-NEXT:    v_bfe_u32 v7, v0, 16, 8
+; GFX8-NEXT:    v_bfe_u32 v8, v2, 16, 8
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mad_u32_u24 v3, v3, v4, s2
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xff, v1
+; GFX8-NEXT:    v_mad_u32_u24 v3, v7, v8, v3
+; GFX8-NEXT:    v_bfe_u32 v11, v1, 16, 8
+; GFX8-NEXT:    v_mad_u32_u24 v3, v9, v4, v3
+; GFX8-NEXT:    v_bfe_u32 v5, v0, 8, 8
+; GFX8-NEXT:    v_bfe_u32 v6, v2, 8, 8
+; GFX8-NEXT:    v_mad_u32_u24 v3, v11, v8, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; GFX8-NEXT:    v_mad_u32_u24 v3, v5, v6, v3
+; GFX8-NEXT:    v_bfe_u32 v10, v1, 8, 8
+; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v2, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
+; GFX8-NEXT:    v_mad_u32_u24 v0, v10, v6, v0
+; GFX8-NEXT:    v_mad_u32_u24 v2, v1, v2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: udot4_acc32_multi:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v3, v2, s[6:7]
+; GFX9-NODL-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xff, v3
+; GFX9-NODL-NEXT:    v_bfe_u32 v6, v3, 16, 8
+; GFX9-NODL-NEXT:    v_bfe_u32 v5, v3, 8, 8
+; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v7, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v9, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v8, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_add3_u32 v3, v7, s0, v9
+; GFX9-NODL-NEXT:    v_add3_u32 v3, v3, v4, v6
+; GFX9-NODL-NEXT:    v_add3_u32 v0, v8, v3, v0
+; GFX9-NODL-NEXT:    v_add3_u32 v0, v0, v5, v1
+; GFX9-NODL-NEXT:    global_store_dword v2, v0, s[2:3]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: udot4_acc32_multi:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX9-DL-NEXT:    s_mov_b32 s0, 0x6040200
+; GFX9-DL-NEXT:    s_mov_b32 s1, 0x2000200
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v3, v2, s[6:7]
+; GFX9-DL-NEXT:    s_load_dword s5, s[2:3], 0x0
+; GFX9-DL-NEXT:    s_mov_b32 s4, 0x7050301
+; GFX9-DL-NEXT:    s_mov_b32 s6, 0x3010301
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_perm_b32 v4, v1, v0, s0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v5, v3, v3, s1
+; GFX9-DL-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v4, v5, s5
+; GFX9-DL-NEXT:    v_perm_b32 v3, v3, v3, s6
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v0, v0, v3, v1
+; GFX9-DL-NEXT:    global_store_dword v2, v0, s[2:3]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot4_acc32_multi:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x1
+; GFX10-DL-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v3, v2, s[6:7]
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_perm_b32 v2, v1, v0, 0x6040200
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v4, v3, v3, 0x2000200
+; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v0, 0x7050301
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, v2, v4, s2
+; GFX10-DL-NEXT:    v_perm_b32 v2, v3, v3, 0x3010301
+; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v0, v2, v1
+; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_acc32_multi:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b64 v[0:1], v2, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v2, v2, s[6:7]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_perm_b32 v3, v1, v0, 0x6040200
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v4, v2, v2, 0x2000200
+; GFX11-DL-NEXT:    v_perm_b32 v0, v1, v0, 0x7050301
+; GFX11-DL-NEXT:    v_perm_b32 v2, v2, v2, 0x3010301
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v1, v3, v4, s2
+; GFX11-DL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v2, v1
+; GFX11-DL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
+
+  %v1e0 = extractelement <8 x i8> %vec1, i64 0
+  %cv1e0 = zext i8 %v1e0 to i32
+  %v2e0 = extractelement <8 x i8> %vec2, i64 0
+  %cv2e0 = zext i8 %v2e0 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
+
+  %v1e1 = extractelement <8 x i8> %vec1, i64 1
+  %cv1e1 = zext i8 %v1e1 to i32
+  %v2e1 = extractelement <8 x i8> %vec2, i64 1
+  %cv2e1 = zext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+
+  %v1e2 = extractelement <8 x i8> %vec1, i64 2
+  %cv1e2 = zext i8 %v1e2 to i32
+  %v2e2 = extractelement <8 x i8> %vec2, i64 2
+  %cv2e2 = zext i8 %v2e2 to i32
+  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
+
+  %v1e3 = extractelement <8 x i8> %vec1, i64 3
+  %cv1e3 = zext i8 %v1e3 to i32
+  %v2e3 = extractelement <8 x i8> %vec2, i64 3
+  %cv2e3 = zext i8 %v2e3 to i32
+  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
+
+  %v1e4 = extractelement <8 x i8> %vec1, i64 4
+  %cv1e4 = zext i8 %v1e4 to i32
+  %v2e4 = extractelement <8 x i8> %vec2, i64 4
+  %cv2e4 = zext i8 %v2e4 to i32
+  %mul5 = mul nuw nsw i32 %cv1e4, %cv2e0
+
+  %v1e5 = extractelement <8 x i8> %vec1, i64 5
+  %cv1e5 = zext i8 %v1e5 to i32
+  %v2e5 = extractelement <8 x i8> %vec2, i64 5
+  %cv2e5 = zext i8 %v2e5 to i32
+  %mul6 = mul nuw nsw i32 %cv1e5, %cv2e1
+
+  %v1e6 = extractelement <8 x i8> %vec1, i64 6
+  %cv1e6 = zext i8 %v1e6 to i32
+  %v2e6 = extractelement <8 x i8> %vec2, i64 6
+  %cv2e6 = zext i8 %v2e6 to i32
+  %mul7 = mul nuw nsw i32 %cv1e6, %cv2e2
+
+  %v1e7 = extractelement <8 x i8> %vec1, i64 7
+  %cv1e7 = zext i8 %v1e7 to i32
+  %v2e7 = extractelement <8 x i8> %vec2, i64 7
+  %cv2e7 = zext i8 %v2e7 to i32
+  %mul8 = mul nuw nsw i32 %cv1e7, %cv2e3
+
+  %acc = load i32, ptr addrspace(1) %dst, align 4
+  %mad11 = add i32 %mul1, %acc
+  %mad21 = add i32 %mad11, %mul3
+  %mad31 = add i32 %mad21, %mul5
+  %mad41 = add i32 %mad31, %mul7
+  %mad12 = add i32 %mul2, %mad41
+  %mad22 = add i32 %mad12, %mul4
+  %mad32 = add i32 %mad22, %mul6
+  %mad42 = add i32 %mad32, %mul8
+
+  store i32 %mad42, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x()

>From 9a5c66abf23c1934f384d1d1a13a41525fde2488 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 23 Aug 2023 13:28:00 -0700
Subject: [PATCH 2/4] [AMDGPU] Accept arbitrary sized sources in
 CalculateByteProvider

This allows working with e.g. v8i8 / v16i8 sources.

It is generally useful, but is primarily beneficial when allowing e.g. v8i8s to be passed to branches directly through registers. As such, this is the first in a series of patches to enable that work. However, it affects https://reviews.llvm.org/D155995, so it has been implemented on top of that.

Differential Revision: https://reviews.llvm.org/D159036

Change-Id: I3b78398eda8011a417f851415490b11b63646233
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  194 +--
 llvm/test/CodeGen/AMDGPU/idot4u.ll            | 1087 +++++++++++++++++
 .../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll |   15 +-
 llvm/test/CodeGen/AMDGPU/load-hi16.ll         |   36 +-
 llvm/test/CodeGen/AMDGPU/permute_i8.ll        |  260 ++++
 5 files changed, 1492 insertions(+), 100 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a620cbe239d8066..a6386c116fa7808 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10658,8 +10658,7 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
   if (Depth >= 6)
     return std::nullopt;
 
-  auto ValueSize = Op.getValueSizeInBits();
-  if (ValueSize != 8 && ValueSize != 16 && ValueSize != 32)
+  if (Op.getValueSizeInBits() < 8)
     return std::nullopt;
 
   switch (Op->getOpcode()) {
@@ -10919,8 +10918,6 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
     auto VecIdx = IdxOp->getZExtValue();
     auto ScalarSize = Op.getScalarValueSizeInBits();
     if (ScalarSize != 32) {
-      if ((VecIdx + 1) * ScalarSize > 32)
-        return std::nullopt;
       Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
     }
 
@@ -11006,9 +11003,6 @@ static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
   int Low16 = PermMask & 0xffff;
   int Hi16 = (PermMask & 0xffff0000) >> 16;
 
-  assert(Op.getValueType().isByteSized());
-  assert(OtherOp.getValueType().isByteSized());
-
   auto TempOp = peekThroughBitcasts(Op);
   auto TempOtherOp = peekThroughBitcasts(OtherOp);
 
@@ -11026,6 +11020,31 @@ static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
   return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
 }
 
+static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
+                                  unsigned DWordOffset) {
+  SDValue Ret;
+  if (Src.getValueSizeInBits() <= 32)
+    return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
+
+  if (Src.getValueSizeInBits() >= 256) {
+    assert(!(Src.getValueSizeInBits() % 32));
+    Ret = DAG.getBitcast(
+        MVT::getVectorVT(MVT::i32, Src.getValueSizeInBits() / 32), Src);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ret,
+                       DAG.getConstant(DWordOffset, SL, MVT::i32));
+  }
+
+  Ret = DAG.getBitcastedAnyExtOrTrunc(
+      Src, SL, MVT::getIntegerVT(Src.getValueSizeInBits()));
+  if (DWordOffset) {
+    auto Shifted = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
+                               DAG.getConstant(DWordOffset * 32, SL, MVT::i32));
+    return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Shifted);
+  }
+
+  return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
+}
+
 SDValue SITargetLowering::performOrCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -11156,8 +11175,8 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
       if (PermNodes.size() != 4)
         return SDValue();
 
-      int FirstSrc = 0;
-      std::optional<int> SecondSrc;
+      std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
+      std::optional<std::pair<unsigned, unsigned>> SecondSrc;
       uint64_t PermMask = 0x00000000;
       for (size_t i = 0; i < PermNodes.size(); i++) {
         auto PermOp = PermNodes[i];
@@ -11165,27 +11184,31 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
         // by sizeof(Src2) = 4
         int SrcByteAdjust = 4;
 
-        if (!PermOp.hasSameSrc(PermNodes[FirstSrc])) {
-          if (SecondSrc.has_value())
-            if (!PermOp.hasSameSrc(PermNodes[*SecondSrc]))
+        // If the Src uses a byte from a different DWORD, then it corresponds
+        // with a different source
+        if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
+            ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
+          if (SecondSrc)
+            if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
+                ((PermOp.SrcOffset / 4) != SecondSrc->second))
               return SDValue();
 
           // Set the index of the second distinct Src node
-          SecondSrc = i;
-          assert(!(PermNodes[*SecondSrc].Src->getValueSizeInBits() % 8));
+          SecondSrc = {i, PermNodes[i].SrcOffset / 4};
+          assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
           SrcByteAdjust = 0;
         }
-        assert(PermOp.SrcOffset + SrcByteAdjust < 8);
+        assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
         assert(!DAG.getDataLayout().isBigEndian());
-        PermMask |= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8);
+        PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
       }
-
-      SDValue Op = *PermNodes[FirstSrc].Src;
-      SDValue OtherOp = SecondSrc.has_value() ? *PermNodes[*SecondSrc].Src
-                                              : *PermNodes[FirstSrc].Src;
+      SDLoc DL(N);
+      SDValue Op = *PermNodes[FirstSrc.first].Src;
+      Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
+      assert(Op.getValueSizeInBits() == 32);
 
       // Check that we are not just extracting the bytes in order from an op
-      if (Op == OtherOp && Op.getValueSizeInBits() == 32) {
+      if (!SecondSrc) {
         int Low16 = PermMask & 0xffff;
         int Hi16 = (PermMask & 0xffff0000) >> 16;
 
@@ -11197,8 +11220,16 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
           return DAG.getBitcast(MVT::getIntegerVT(32), Op);
       }
 
+      SDValue OtherOp =
+          SecondSrc.has_value() ? *PermNodes[SecondSrc->first].Src : Op;
+
+      if (SecondSrc)
+        OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
+
+      assert(Op.getValueSizeInBits() == 32);
+
       if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
-        SDLoc DL(N);
+
         assert(Op.getValueType().isByteSized() &&
                OtherOp.getValueType().isByteSized());
 
@@ -12570,17 +12601,24 @@ static unsigned addPermMasks(unsigned First, unsigned Second) {
   return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
 }
 
+struct DotSrc {
+  SDValue SrcOp;
+  int64_t PermMask;
+  int64_t DWordOffset;
+};
+
 static void placeSources(ByteProvider<SDValue> &Src0,
                          ByteProvider<SDValue> &Src1,
-                         SmallVectorImpl<std::pair<SDValue, unsigned>> &Src0s,
-                         SmallVectorImpl<std::pair<SDValue, unsigned>> &Src1s,
-                         int Step) {
+                         SmallVectorImpl<DotSrc> &Src0s,
+                         SmallVectorImpl<DotSrc> &Src1s, int Step) {
 
   assert(Src0.Src.has_value() && Src1.Src.has_value());
   // Src0s and Src1s are empty, just place arbitrarily
   if (Step == 0) {
-    Src0s.push_back({*Src0.Src, (Src0.SrcOffset << 24) + 0x0c0c0c});
-    Src1s.push_back({*Src1.Src, (Src1.SrcOffset << 24) + 0x0c0c0c});
+    Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
+                     Src0.SrcOffset / 4});
+    Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
+                     Src1.SrcOffset / 4});
     return;
   }
 
@@ -12593,38 +12631,38 @@ static void placeSources(ByteProvider<SDValue> &Src0,
     unsigned FMask = 0xFF << (8 * (3 - Step));
 
     unsigned FirstMask =
-        BPP.first.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask);
+        (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
     unsigned SecondMask =
-        BPP.second.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask);
+        (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
     // Attempt to find Src vector which contains our SDValue, if so, add our
     // perm mask to the existing one. If we are unable to find a match for the
     // first SDValue, attempt to find match for the second.
     int FirstGroup = -1;
     for (int I = 0; I < 2; I++) {
-      SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs =
-          I == 0 ? Src0s : Src1s;
-      auto MatchesFirst = [&BPP](std::pair<SDValue, unsigned> IterElt) {
-        return IterElt.first == *BPP.first.Src;
+      SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
+      auto MatchesFirst = [&BPP](DotSrc &IterElt) {
+        return IterElt.SrcOp == *BPP.first.Src &&
+               (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
       };
 
       auto Match = std::find_if(Srcs.begin(), Srcs.end(), MatchesFirst);
       if (Match != Srcs.end()) {
-        Match->second = addPermMasks(FirstMask, Match->second);
+        Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
         FirstGroup = I;
         break;
       }
     }
     if (FirstGroup != -1) {
-      SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs =
-          FirstGroup == 1 ? Src0s : Src1s;
-      auto MatchesSecond = [&BPP](std::pair<SDValue, unsigned> IterElt) {
-        return IterElt.first == *BPP.second.Src;
+      SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
+      auto MatchesSecond = [&BPP](DotSrc &IterElt) {
+        return IterElt.SrcOp == *BPP.second.Src &&
+               (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
       };
       auto Match = std::find_if(Srcs.begin(), Srcs.end(), MatchesSecond);
       if (Match != Srcs.end()) {
-        Match->second = addPermMasks(SecondMask, Match->second);
+        Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
       } else
-        Srcs.push_back({*BPP.second.Src, SecondMask});
+        Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
       return;
     }
   }
@@ -12636,29 +12674,32 @@ static void placeSources(ByteProvider<SDValue> &Src0,
   unsigned FMask = 0xFF << (8 * (3 - Step));
 
   Src0s.push_back(
-      {*Src0.Src, (Src0.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask))});
+      {*Src0.Src,
+       ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
+       Src1.SrcOffset / 4});
   Src1s.push_back(
-      {*Src1.Src, (Src1.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask))});
+      {*Src1.Src,
+       ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
+       Src1.SrcOffset / 4});
 
   return;
 }
 
-static SDValue
-resolveSources(SelectionDAG &DAG, SDLoc SL,
-               SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs,
-               bool IsSigned, bool IsAny) {
+static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
+                              SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
+                              bool IsAny) {
 
   // If we just have one source, just permute it accordingly.
   if (Srcs.size() == 1) {
     auto Elt = Srcs.begin();
-    auto EltVal = DAG.getBitcastedAnyExtOrTrunc(Elt->first, SL, MVT::i32);
+    auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
 
     // v_perm will produce the original value
-    if (Elt->second == 0x3020100)
-      return EltVal;
+    if (Elt->PermMask == 0x3020100)
+      return EltOp;
 
-    return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltVal, EltVal,
-                       DAG.getConstant(Elt->second, SL, MVT::i32));
+    return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
+                       DAG.getConstant(Elt->PermMask, SL, MVT::i32));
   }
 
   auto FirstElt = Srcs.begin();
@@ -12669,8 +12710,8 @@ resolveSources(SelectionDAG &DAG, SDLoc SL,
   // If we have multiple sources in the chain, combine them via perms (using
   // calculated perm mask) and Ors.
   while (true) {
-    auto FirstMask = FirstElt->second;
-    auto SecondMask = SecondElt->second;
+    auto FirstMask = FirstElt->PermMask;
+    auto SecondMask = SecondElt->PermMask;
 
     unsigned FirstCs = FirstMask & 0x0c0c0c0c;
     unsigned FirstPlusFour = FirstMask | 0x04040404;
@@ -12680,9 +12721,9 @@ resolveSources(SelectionDAG &DAG, SDLoc SL,
 
     auto PermMask = addPermMasks(FirstMask, SecondMask);
     auto FirstVal =
-        DAG.getBitcastedAnyExtOrTrunc(FirstElt->first, SL, MVT::i32);
+        getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
     auto SecondVal =
-        DAG.getBitcastedAnyExtOrTrunc(SecondElt->first, SL, MVT::i32);
+        getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
 
     Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
                                 SecondVal,
@@ -12696,12 +12737,12 @@ resolveSources(SelectionDAG &DAG, SDLoc SL,
     // If we only have a FirstElt, then just combine that into the cumulative
     // source node
     if (SecondElt == Srcs.end()) {
-      auto EltVal =
-          DAG.getBitcastedAnyExtOrTrunc(FirstElt->first, SL, MVT::i32);
+      auto EltOp =
+          getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
 
       Perms.push_back(
-          DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltVal, EltVal,
-                      DAG.getConstant(FirstElt->second, SL, MVT::i32)));
+          DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
+                      DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
       break;
     }
   }
@@ -12712,9 +12753,8 @@ resolveSources(SelectionDAG &DAG, SDLoc SL,
              : Perms[0];
 }
 
-static void fixMasks(SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs,
-                     unsigned ChainLength) {
-  for (auto &[EntryVal, EntryMask] : Srcs) {
+static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
+  for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
     EntryMask = EntryMask >> ((4 - ChainLength) * 8);
     auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
     EntryMask += ZeroMask;
@@ -12754,8 +12794,8 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
 
     auto MulOpcode = TempNode.getOperand(MulIdx).getOpcode();
     std::optional<bool> IsSigned;
-    SmallVector<std::pair<SDValue, unsigned>, 4> Src0s;
-    SmallVector<std::pair<SDValue, unsigned>, 4> Src1s;
+    SmallVector<DotSrc, 4> Src0s;
+    SmallVector<DotSrc, 4> Src1s;
     SmallVector<SDValue, 4> Src2s;
 
     // Match the v_dot4 tree, while collecting src nodes.
@@ -12844,11 +12884,11 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
     // (commutation)
     bool UseOriginalSrc = false;
     if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
-        Src0s.begin()->second == Src1s.begin()->second &&
-        Src0s.begin()->first.getValueSizeInBits() == 32 &&
-        Src1s.begin()->first.getValueSizeInBits() == 32) {
+        Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
+        Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
+        Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
       SmallVector<unsigned, 4> SrcBytes;
-      auto Src0Mask = Src0s.begin()->second;
+      auto Src0Mask = Src0s.begin()->PermMask;
       SrcBytes.push_back(Src0Mask & 0xFF000000);
       bool UniqueEntries = true;
       for (auto I = 1; I < 4; I++) {
@@ -12863,11 +12903,19 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
 
       if (UniqueEntries) {
         UseOriginalSrc = true;
-        // Must be 32 bits to enter above conditional
-        assert(Src0s.begin()->first.getValueSizeInBits() == 32);
-        assert(Src1s.begin()->first.getValueSizeInBits() == 32);
-        Src0 = DAG.getBitcast(MVT::getIntegerVT(32), Src0s.begin()->first);
-        Src1 = DAG.getBitcast(MVT::getIntegerVT(32), Src1s.begin()->first);
+
+        auto FirstElt = Src0s.begin();
+        auto FirstEltOp =
+            getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
+
+        auto SecondElt = Src1s.begin();
+        auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
+                                              SecondElt->DWordOffset);
+
+        Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
+                                             MVT::getIntegerVT(32));
+        Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
+                                             MVT::getIntegerVT(32));
       }
     }
 
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index a82c5215f3b2c65..e6b6a0bedd72e99 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -4622,4 +4622,1091 @@ entry:
   ret void
 }
 
+define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1,
+; GFX7-LABEL: idot4_acc32_hilo:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[10:11]
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 offset:4
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_bfe_u32 v5, v0, 8, 8
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v0
+; GFX7-NEXT:    v_mul_u32_u24_e32 v4, v4, v5
+; GFX7-NEXT:    v_bfe_u32 v6, v2, 16, 8
+; GFX7-NEXT:    v_bfe_u32 v7, v0, 16, 8
+; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v3, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v7, v1
+; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: idot4_acc32_hilo:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s4, v0
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v4, v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-NEXT:    flat_load_dword v2, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX8-NEXT:    v_bfe_u32 v5, v4, 16, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xff, v2
+; GFX8-NEXT:    v_mul_u32_u24_sdwa v7, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX8-NEXT:    v_bfe_u32 v8, v2, 16, 8
+; GFX8-NEXT:    v_mad_u32_u24 v3, v6, v3, v7
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v4
+; GFX8-NEXT:    v_mad_u32_u24 v3, v8, v5, v3
+; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v4, v3
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_acc32_hilo:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5] offset:4
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_and_b32_e32 v3, 0xff, v1
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xff, v2
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, v3, v4, v5
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v6, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: idot4_acc32_hilo:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5] offset:4
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, 0
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_acc32_hilo:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x1
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5] offset:4
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, 0
+; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32_hilo:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5] offset:4
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, 0
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
+
+  %v1e0 = extractelement <8 x i8> %vec1, i64 4
+  %cv1e0 = zext i8 %v1e0 to i32
+  %v2e0 = extractelement <8 x i8> %vec2, i64 0
+  %cv2e0 = zext i8 %v2e0 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
+
+  %v1e1 = extractelement <8 x i8> %vec1, i64 5
+  %cv1e1 = zext i8 %v1e1 to i32
+  %v2e1 = extractelement <8 x i8> %vec2, i64 1
+  %cv2e1 = zext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+
+  %v1e2 = extractelement <8 x i8> %vec1, i64 6
+  %cv1e2 = zext i8 %v1e2 to i32
+  %v2e2 = extractelement <8 x i8> %vec2, i64 2
+  %cv2e2 = zext i8 %v2e2 to i32
+  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
+
+  %v1e3 = extractelement <8 x i8> %vec1, i64 7
+  %cv1e3 = zext i8 %v1e3 to i32
+  %v2e3 = extractelement <8 x i8> %vec2, i64 3
+  %cv2e3 = zext i8 %v2e3 to i32
+  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
+
+  %add1 = add i32 %mul1, 0
+  %add2 = add i32 %add1, %mul2
+  %add3 = add i32 %add2, %mul3
+  %add4 = add i32 %add3, %mul4
+  store i32 %add4, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1,
+; GFX7-LABEL: idot4_acc32_lohi:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 offset:4
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_bfe_u32 v3, v2, 8, 8
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_bfe_u32 v6, v0, 16, 8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
+; GFX7-NEXT:    v_mul_u32_u24_e32 v3, v3, v6
+; GFX7-NEXT:    v_bfe_u32 v4, v2, 16, 8
+; GFX7-NEXT:    v_bfe_u32 v7, v0, 8, 8
+; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v5, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
+; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: idot4_acc32_lohi:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v3, s7
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s6, v2
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT:    flat_load_dword v4, v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-NEXT:    flat_load_dword v2, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX8-NEXT:    v_bfe_u32 v5, v4, 16, 8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v4
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
+; GFX8-NEXT:    v_mul_u32_u24_sdwa v4, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_2
+; GFX8-NEXT:    v_bfe_u32 v8, v2, 8, 8
+; GFX8-NEXT:    v_mad_u32_u24 v3, v3, v7, v4
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX8-NEXT:    v_mad_u32_u24 v3, v5, v8, v3
+; GFX8-NEXT:    v_mad_u32_u24 v2, v6, v2, v3
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_acc32_lohi:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7] offset:4
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_and_b32_e32 v3, 0xff, v1
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_2
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_0
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, v3, v4, v5
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v6, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: idot4_acc32_lohi:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-DL-NEXT:    s_mov_b32 s0, 0x10302
+; GFX9-DL-NEXT:    s_mov_b32 s1, 0x3020001
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[6:7] offset:4
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s1
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v2, v1, 0
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_acc32_lohi:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x1
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[6:7] offset:4
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v1, 0x10302
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v1, v2, v2, 0x3020001
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, 0
+; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32_lohi:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[6:7] offset:4
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[4:5]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0x10302
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0x3020001
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, 0
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
+
+  %v1e0 = extractelement <8 x i8> %vec1, i64 0
+  %cv1e0 = zext i8 %v1e0 to i32
+  %v2e0 = extractelement <8 x i8> %vec2, i64 7
+  %cv2e0 = zext i8 %v2e0 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
+
+  %v1e1 = extractelement <8 x i8> %vec1, i64 1
+  %cv1e1 = zext i8 %v1e1 to i32
+  %v2e1 = extractelement <8 x i8> %vec2, i64 6
+  %cv2e1 = zext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+
+  %v1e2 = extractelement <8 x i8> %vec1, i64 2
+  %cv1e2 = zext i8 %v1e2 to i32
+  %v2e2 = extractelement <8 x i8> %vec2, i64 5
+  %cv2e2 = zext i8 %v2e2 to i32
+  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
+
+  %v1e3 = extractelement <8 x i8> %vec1, i64 3
+  %cv1e3 = zext i8 %v1e3 to i32
+  %v2e3 = extractelement <8 x i8> %vec2, i64 4
+  %cv2e3 = zext i8 %v2e3 to i32
+  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
+
+  %add1 = add i32 %mul1, 0
+  %add2 = add i32 %add1, %mul2
+  %add3 = add i32 %add2, %mul3
+  %add4 = add i32 %add3, %mul4
+  store i32 %add4, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1,
+; GFX7-LABEL: idot4_acc32_hihi:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 offset:4
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_bfe_u32 v3, v2, 16, 8
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v0
+; GFX7-NEXT:    v_bfe_u32 v5, v0, 16, 8
+; GFX7-NEXT:    v_mul_u32_u24_e32 v3, v3, v6
+; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v0
+; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v5, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 8, 8
+; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
+; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: idot4_acc32_hihi:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s4, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v3, s7
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s6, v0
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v2, v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v4
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xff, v2
+; GFX8-NEXT:    v_bfe_u32 v7, v2, 8, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 8
+; GFX8-NEXT:    v_mul_u32_u24_sdwa v6, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 24, v3
+; GFX8-NEXT:    v_mad_u32_u24 v4, v4, v5, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; GFX8-NEXT:    v_bfe_u32 v3, v3, 8, 8
+; GFX8-NEXT:    v_mad_u32_u24 v4, v7, v8, v4
+; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v3, v4
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_acc32_hihi:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5] offset:4
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7] offset:4
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_and_b32_e32 v3, 0xff, v1
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_bfe_u32 v4, v2, 16, 8
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_3
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, v3, v4, v5
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v6, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: idot4_acc32_hihi:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-DL-NEXT:    s_mov_b32 s0, 0x1030200
+; GFX9-DL-NEXT:    s_mov_b32 s1, 0x3010002
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[6:7] offset:4
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5] offset:4
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s1
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v2, v1, 0
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_acc32_hihi:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x1
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[6:7] offset:4
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5] offset:4
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v1, 0x1030200
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v1, v2, v2, 0x3010002
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, 0
+; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32_hihi:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[6:7] offset:4
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[4:5] offset:4
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0x1030200
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0x3010002
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, 0
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
+  ; Only lanes 4-7 of each <8 x i8> load are used: %vec1 lanes 4,6,5,7 are multiplied with %vec2 lanes 6,4,7,5 (each zero-extended to i32).
+  %v1e0 = extractelement <8 x i8> %vec1, i64 4
+  %cv1e0 = zext i8 %v1e0 to i32
+  %v2e0 = extractelement <8 x i8> %vec2, i64 6
+  %cv2e0 = zext i8 %v2e0 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
+
+  %v1e1 = extractelement <8 x i8> %vec1, i64 6
+  %cv1e1 = zext i8 %v1e1 to i32
+  %v2e1 = extractelement <8 x i8> %vec2, i64 4
+  %cv2e1 = zext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+
+  %v1e2 = extractelement <8 x i8> %vec1, i64 5
+  %cv1e2 = zext i8 %v1e2 to i32
+  %v2e2 = extractelement <8 x i8> %vec2, i64 7
+  %cv2e2 = zext i8 %v2e2 to i32
+  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
+
+  %v1e3 = extractelement <8 x i8> %vec1, i64 7
+  %cv1e3 = zext i8 %v1e3 to i32
+  %v2e3 = extractelement <8 x i8> %vec2, i64 5
+  %cv2e3 = zext i8 %v2e3 to i32
+  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
+  ; Accumulate the four products starting from 0 and store the i32 result to %dst.
+  %add1 = add i32 %mul1, 0
+  %add2 = add i32 %add1, %mul2
+  %add3 = add i32 %add2, %mul3
+  %add4 = add i32 %add3, %mul4
+  store i32 %add4, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @idot4_acc32_v8i8(ptr addrspace(1) %src1,
+; GFX7-LABEL: idot4_acc32_v8i8:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, s3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_bfe_u32 v4, v0, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v5, v1, 8, 8
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xff, v0
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v1
+; GFX7-NEXT:    v_mul_u32_u24_e32 v4, v4, v5
+; GFX7-NEXT:    v_bfe_u32 v6, v0, 16, 8
+; GFX7-NEXT:    v_bfe_u32 v7, v1, 16, 8
+; GFX7-NEXT:    v_mad_u32_u24 v2, v2, v3, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
+; GFX7-NEXT:    v_mad_u32_u24 v2, v6, v7, v2
+; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v1, v2
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: idot4_acc32_v8i8:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v0
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xff, v1
+; GFX8-NEXT:    v_mul_u32_u24_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX8-NEXT:    v_bfe_u32 v5, v0, 16, 8
+; GFX8-NEXT:    v_bfe_u32 v6, v1, 16, 8
+; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v3, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
+; GFX8-NEXT:    v_mad_u32_u24 v2, v5, v6, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, v0, v1, v2
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_acc32_v8i8:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_and_b32_e32 v3, 0xff, v0
+; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xff, v1
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, v3, v4, v5
+; GFX9-NODL-NEXT:    v_add3_u32 v0, v1, v6, v0
+; GFX9-NODL-NEXT:    global_store_dword v2, v0, s[4:5]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: idot4_acc32_v8i8:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, 0
+; GFX9-DL-NEXT:    global_store_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_acc32_v8i8:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, 0
+; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32_v8i8:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, 0
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
+  ; Both operands come from the single %vec1 load: lanes 0-3 are multiplied
+  ; element-wise with lanes 4-7 (each zero-extended to i32). %src2 is never read.
+  %v1e0 = extractelement <8 x i8> %vec1, i64 0
+  %cv1e0 = zext i8 %v1e0 to i32
+  %v2e0 = extractelement <8 x i8> %vec1, i64 4
+  %cv2e0 = zext i8 %v2e0 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
+
+  %v1e1 = extractelement <8 x i8> %vec1, i64 1
+  %cv1e1 = zext i8 %v1e1 to i32
+  %v2e1 = extractelement <8 x i8> %vec1, i64 5
+  %cv2e1 = zext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+
+  %v1e2 = extractelement <8 x i8> %vec1, i64 2
+  %cv1e2 = zext i8 %v1e2 to i32
+  %v2e2 = extractelement <8 x i8> %vec1, i64 6
+  %cv2e2 = zext i8 %v2e2 to i32
+  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
+
+  %v1e3 = extractelement <8 x i8> %vec1, i64 3
+  %cv1e3 = zext i8 %v1e3 to i32
+  %v2e3 = extractelement <8 x i8> %vec1, i64 7
+  %cv2e3 = zext i8 %v2e3 to i32
+  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
+  ; Accumulate the four products starting from 0 and store the i32 result to %dst.
+  %add1 = add i32 %mul1, 0
+  %add2 = add i32 %add1, %mul2
+  %add3 = add i32 %add2, %mul3
+  %add4 = add i32 %add3, %mul4
+  store i32 %add4, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1,
+; GFX7-LABEL: idot4_acc32_v16i8:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v0
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[10:11]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v5, v2
+; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[1:2], s[8:11], 0 addr64
+; GFX7-NEXT:    buffer_load_dword v0, v[4:5], s[4:7], 0 addr64
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_bfe_u32 v5, v0, 8, 8
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v0
+; GFX7-NEXT:    v_mul_u32_u24_e32 v2, v2, v5
+; GFX7-NEXT:    v_bfe_u32 v6, v3, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v7, v0, 16, 8
+; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v4, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v7, v1
+; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: idot4_acc32_v16i8:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, s5
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s4, v1
+; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v3, s7
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s6, v0
+; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v3, vcc
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[1:2]
+; GFX8-NEXT:    flat_load_dword v4, v[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xff, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xff, v4
+; GFX8-NEXT:    v_mul_u32_u24_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_1
+; GFX8-NEXT:    v_bfe_u32 v7, v3, 8, 8
+; GFX8-NEXT:    v_bfe_u32 v8, v4, 16, 8
+; GFX8-NEXT:    v_mad_u32_u24 v2, v5, v6, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v4
+; GFX8-NEXT:    v_mad_u32_u24 v2, v7, v8, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v4, v2
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_acc32_v16i8:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
+; GFX9-NODL-NEXT:    ; kill: killed $vgpr5
+; GFX9-NODL-NEXT:    ; kill: killed $vgpr4
+; GFX9-NODL-NEXT:    ; kill: killed $sgpr4_sgpr5_sgpr6 killed $sgpr7
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dwordx4 v[0:3], v4, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v0, v5, s[6:7]
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xff, v2
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_and_b32_e32 v5, 0xff, v0
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v6, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_2
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, v4, v5, v2
+; GFX9-NODL-NEXT:    v_add3_u32 v0, v2, v6, v0
+; GFX9-NODL-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: idot4_acc32_v16i8:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
+; GFX9-DL-NEXT:    s_mov_b32 s0, 0x7050002
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dwordx4 v[0:3], v4, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v0, v5, s[6:7]
+; GFX9-DL-NEXT:    s_mov_b32 s1, 0x3020001
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-DL-NEXT:    ; kill: killed $vgpr5
+; GFX9-DL-NEXT:    ; kill: killed $vgpr4
+; GFX9-DL-NEXT:    ; kill: killed $sgpr4_sgpr5_sgpr6 killed $sgpr7
+; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v2, s0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v0, v0, v0, s1
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v0, v2, v0, 0
+; GFX9-DL-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_acc32_v16i8:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    ; kill: killed $vgpr5
+; GFX10-DL-NEXT:    ; kill: killed $vgpr4
+; GFX10-DL-NEXT:    ; kill: killed $sgpr4_sgpr5_sgpr6 killed $sgpr7
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    global_load_dwordx4 v[0:3], v4, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v0, v5, s[6:7]
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_perm_b32 v1, v3, v2, 0x7050002
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v0, v0, v0, 0x3020001
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, 0
+; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32_v16i8:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v1, 4, v0
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    global_load_b128 v[0:3], v1, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v4, s[6:7]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_perm_b32 v1, v3, v2, 0x7050002
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0x3020001
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, 0
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <16 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <16 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
+  ; %vec1 is <16 x i8> but only lanes 8,10,13,15 are used; they are multiplied with %vec2 lanes 0,1,2,3 (each zero-extended to i32).
+  %v1e0 = extractelement <16 x i8> %vec1, i64 8
+  %cv1e0 = zext i8 %v1e0 to i32
+  %v2e0 = extractelement <8 x i8> %vec2, i64 0
+  %cv2e0 = zext i8 %v2e0 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
+
+  %v1e1 = extractelement <16 x i8> %vec1, i64 10
+  %cv1e1 = zext i8 %v1e1 to i32
+  %v2e1 = extractelement <8 x i8> %vec2, i64 1
+  %cv2e1 = zext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+
+  %v1e2 = extractelement <16 x i8> %vec1, i64 13
+  %cv1e2 = zext i8 %v1e2 to i32
+  %v2e2 = extractelement <8 x i8> %vec2, i64 2
+  %cv2e2 = zext i8 %v2e2 to i32
+  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
+
+  %v1e3 = extractelement <16 x i8> %vec1, i64 15
+  %cv1e3 = zext i8 %v1e3 to i32
+  %v2e3 = extractelement <8 x i8> %vec2, i64 3
+  %cv2e3 = zext i8 %v2e3 to i32
+  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
+  ; Accumulate the four products starting from 0 and store the i32 result to %dst.
+  %add1 = add i32 %mul1, 0
+  %add2 = add i32 %add1, %mul2
+  %add3 = add i32 %add2, %mul3
+  %add4 = add i32 %add3, %mul4
+  store i32 %add4, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1,
+; GFX7-LABEL: idot4_acc32_v256i8:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[10:11]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7-NEXT:    buffer_load_dword v0, v[1:2], s[4:7], 0 addr64 offset:252
+; GFX7-NEXT:    buffer_load_dword v1, v[3:4], s[8:11], 0 addr64
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_bfe_u32 v4, v0, 16, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_bfe_u32 v5, v1, 8, 8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v1
+; GFX7-NEXT:    v_mul_u32_u24_e32 v4, v4, v5
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v0
+; GFX7-NEXT:    v_bfe_u32 v7, v1, 16, 8
+; GFX7-NEXT:    v_mad_u32_u24 v2, v2, v3, v4
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 8, 8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
+; GFX7-NEXT:    v_mad_u32_u24 v2, v6, v7, v2
+; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v1, v2
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: idot4_acc32_v256i8:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX8-NEXT:    s_movk_i32 s2, 0xfc
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, s5
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v1
+; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v4, v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v3
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
+; GFX8-NEXT:    flat_load_dword v2, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX8-NEXT:    v_bfe_u32 v5, v4, 16, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
+; GFX8-NEXT:    v_mul_u32_u24_sdwa v7, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_1
+; GFX8-NEXT:    v_and_b32_e32 v8, 0xff, v2
+; GFX8-NEXT:    v_mad_u32_u24 v3, v6, v3, v7
+; GFX8-NEXT:    v_bfe_u32 v2, v2, 8, 8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v4
+; GFX8-NEXT:    v_mad_u32_u24 v3, v8, v5, v3
+; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v4, v3
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_acc32_v256i8:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v2, v1, s[4:5] offset:252
+; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[6:7]
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v1, 24, v2
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xff, v3
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v6, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_3
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, v1, v4, v5
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, v6, v2
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: idot4_acc32_v256i8:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-DL-NEXT:    s_mov_b32 s0, 0x3020001
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-DL-NEXT:    global_load_dword v3, v1, s[4:5] offset:252
+; GFX9-DL-NEXT:    s_mov_b32 s1, 0x1000302
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_perm_b32 v1, v2, v2, s0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s1
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v2, v1, 0
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_acc32_v256i8:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    global_load_dword v2, v1, s[6:7]
+; GFX10-DL-NEXT:    global_load_dword v3, v0, s[4:5] offset:252
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_perm_b32 v0, v2, v2, 0x3020001
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v1, v3, v3, 0x1000302
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, 0
+; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32_v256i8:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 3, v0
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    global_load_b32 v1, v1, s[6:7]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[4:5] offset:252
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0x3020001
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0x1000302
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, 0
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <256 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <256 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
+  ; %vec1 is <256 x i8> but only its last four lanes are used: lanes 255,254,252,253 are multiplied with %vec2 lanes 0,1,2,3 (each zero-extended to i32).
+  %v1e0 = extractelement <256 x i8> %vec1, i64 255
+  %cv1e0 = zext i8 %v1e0 to i32
+  %v2e0 = extractelement <8 x i8> %vec2, i64 0
+  %cv2e0 = zext i8 %v2e0 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
+
+  %v1e1 = extractelement <256 x i8> %vec1, i64 254
+  %cv1e1 = zext i8 %v1e1 to i32
+  %v2e1 = extractelement <8 x i8> %vec2, i64 1
+  %cv2e1 = zext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+
+  %v1e2 = extractelement <256 x i8> %vec1, i64 252
+  %cv1e2 = zext i8 %v1e2 to i32
+  %v2e2 = extractelement <8 x i8> %vec2, i64 2
+  %cv2e2 = zext i8 %v2e2 to i32
+  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
+
+  %v1e3 = extractelement <256 x i8> %vec1, i64 253
+  %cv1e3 = zext i8 %v1e3 to i32
+  %v2e3 = extractelement <8 x i8> %vec2, i64 3
+  %cv2e3 = zext i8 %v2e3 to i32
+  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
+  ; Accumulate the four products starting from 0 and store the i32 result to %dst.
+  %add1 = add i32 %mul1, 0
+  %add2 = add i32 %add1, %mul2
+  %add3 = add i32 %add2, %mul3
+  %add4 = add i32 %add3, %mul4
+  store i32 %add4, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index f98b41ba199bd7f..3daa88a7474d36d 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -2226,14 +2226,12 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    s_lshl_b32 s1, s4, 16
-; VI-NEXT:    s_mov_b32 s2, 0xffff
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
-; VI-NEXT:    v_mov_b32_e32 v6, s1
+; VI-NEXT:    s_lshl_b32 s0, s4, 16
+; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    v_mov_b32_e32 v6, s0
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_bfi_b32 v3, s2, v3, v3
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_endpgm
@@ -2308,14 +2306,13 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT:    s_mov_b32 s2, 0xffff
 ; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    v_mov_b32_e32 v6, s4
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
+; VI-NEXT:    s_mov_b32 s0, 0xffff
+; VI-NEXT:    v_mov_b32_e32 v6, s4
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_bfi_b32 v3, s2, v6, v3
-; VI-NEXT:    v_bfi_b32 v1, s2, v1, v1
+; VI-NEXT:    v_bfi_b32 v3, s0, v6, v3
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll
index ba025a2202313fb..26a1716db20271a 100644
--- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll
@@ -266,9 +266,9 @@ define <2 x i16> @load_local_hi_v2i16_reglo(ptr addrspace(3) %in, i16 %reg) #0 {
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_u16 v0, v0
-; GFX803-NEXT:    s_mov_b32 s4, 0x1000504
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX803-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_local_hi_v2i16_reglo:
@@ -311,9 +311,9 @@ define void @load_local_hi_v2i16_reglo_vreg(ptr addrspace(3) %in, i16 %reg) #0 {
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_u16 v0, v0
-; GFX803-NEXT:    s_mov_b32 s4, 0x1000504
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX803-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
@@ -696,9 +696,9 @@ define void @load_global_hi_v2i16_reglo_vreg(ptr addrspace(1) %in, i16 %reg) #0
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff002, v0
 ; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX803-NEXT:    flat_load_ushort v0, v[0:1]
-; GFX803-NEXT:    s_mov_b32 s4, 0x1000504
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
-; GFX803-NEXT:    v_perm_b32 v0, v2, v0, s4
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
@@ -1006,9 +1006,9 @@ define void @load_flat_hi_v2i16_reglo_vreg(ptr %in, i16 %reg) #0 {
 ; GFX803:       ; %bb.0: ; %entry
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    flat_load_ushort v0, v[0:1]
-; GFX803-NEXT:    s_mov_b32 s4, 0x1000504
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX803-NEXT:    v_perm_b32 v0, v2, v0, s4
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
@@ -1300,9 +1300,9 @@ define void @load_private_hi_v2i16_reglo_vreg(ptr addrspace(5) byval(i16) %in, i
 ; GFX803:       ; %bb.0: ; %entry
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:4094
-; GFX803-NEXT:    s_mov_b32 s4, 0x1000504
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
-; GFX803-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
@@ -1399,8 +1399,8 @@ define void @load_private_hi_v2i16_reglo_vreg_nooff(ptr addrspace(5) byval(i16)
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:4094 glc
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
-; GFX803-NEXT:    s_mov_b32 s4, 0x1000504
-; GFX803-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
@@ -1851,9 +1851,9 @@ define void @load_constant_hi_v2i16_reglo_vreg(ptr addrspace(4) %in, i16 %reg) #
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff002, v0
 ; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX803-NEXT:    flat_load_ushort v0, v[0:1]
-; GFX803-NEXT:    s_mov_b32 s4, 0x1000504
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
-; GFX803-NEXT:    v_perm_b32 v0, v2, v0, s4
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
@@ -2069,9 +2069,9 @@ define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg, ptr addrspace(
 ; GFX803-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:4058
-; GFX803-NEXT:    s_mov_b32 s4, 0x1000504
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
-; GFX803-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
@@ -2678,10 +2678,10 @@ define <2 x i16> @load_local_hi_v2i16_store_local_lo(i16 %reg, ptr addrspace(3)
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_u16 v2, v1
-; GFX803-NEXT:    s_mov_b32 s4, 0x1000504
 ; GFX803-NEXT:    ds_write_b16 v1, v0
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX803-NEXT:    v_perm_b32 v2, v0, v2, s4
+; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX803-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
index c71f69edc76fa6e..2d73a06276b18a3 100644
--- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
@@ -3001,3 +3001,263 @@ define hidden void @extractbase(ptr addrspace(1) %in0, ptr addrspace(1) %in1, pt
   store i32 %res, ptr addrspace(1) %out0, align 4
   ret void
 }
+
+define hidden void @extract_hilo(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
+; GFX10-LABEL: extract_hilo:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
+; GFX10-NEXT:    global_load_dword v7, v[2:3], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v6, v7, 0x3060505
+; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: extract_hilo:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
+; GFX9-NEXT:    global_load_dword v7, v[2:3], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x3060505
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v6, v7, s4
+; GFX9-NEXT:    global_store_dword v[4:5], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec1 = load <8 x i8>, ptr addrspace(1) %in0, align 4
+  %vec2 = load <8 x i8>, ptr addrspace(1) %in1, align 4
+  %v1e5 = extractelement <8 x i8> %vec1, i64 5
+  %zv1e5 = zext i8 %v1e5 to i32
+  %byte1 = shl i32 %zv1e5, 8
+
+  %v1e6 = extractelement <8 x i8> %vec1, i64 6
+  %zv1e6 = zext i8 %v1e6 to i32
+  %byte2 = shl i32 %zv1e6, 16
+  %v2e3 = extractelement <8 x i8> %vec2, i64 3
+  %zv2e3 = zext i8 %v2e3 to i32
+  %byte3 = shl i32 %zv2e3, 24
+
+  %tmp0 = or i32 %zv1e5, %byte1
+  %tmp1 = or i32 %tmp0, %byte2
+  %res = or i32 %tmp1, %byte3
+  store i32 %res, ptr addrspace(1) %out0, align 4
+  ret void
+}
+
+define hidden void @extract_lohi(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
+; GFX10-LABEL: extract_lohi:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v6, v[0:1], off
+; GFX10-NEXT:    global_load_dword v7, v[2:3], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v6, v7, 0x70404
+; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: extract_lohi:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v6, v[0:1], off
+; GFX9-NEXT:    global_load_dword v7, v[2:3], off offset:4
+; GFX9-NEXT:    s_mov_b32 s4, 0x70404
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v6, v7, s4
+; GFX9-NEXT:    global_store_dword v[4:5], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec1 = load <8 x i8>, ptr addrspace(1) %in0, align 4
+  %vec2 = load <8 x i8>, ptr addrspace(1) %in1, align 4
+  %v1e0 = extractelement <8 x i8> %vec1, i64 0
+  %zv1e0 = zext i8 %v1e0 to i32
+  %byte1 = shl i32 %zv1e0, 8
+
+  %v1e3 = extractelement <8 x i8> %vec1, i64 3
+  %zv1e3 = zext i8 %v1e3 to i32
+  %byte2 = shl i32 %zv1e3, 16
+  %v2e4 = extractelement <8 x i8> %vec2, i64 4
+  %zv2e4 = zext i8 %v2e4 to i32
+  %byte3 = shl i32 %zv2e4, 24
+
+  %tmp0 = or i32 %zv1e0, %byte1
+  %tmp1 = or i32 %tmp0, %byte2
+  %res = or i32 %tmp1, %byte3
+  store i32 %res, ptr addrspace(1) %out0, align 4
+  ret void
+}
+
+define hidden void @extract_hihi(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
+; GFX10-LABEL: extract_hihi:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
+; GFX10-NEXT:    global_load_dword v7, v[2:3], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v6, v7, 0x2070505
+; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: extract_hihi:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
+; GFX9-NEXT:    global_load_dword v7, v[2:3], off offset:4
+; GFX9-NEXT:    s_mov_b32 s4, 0x2070505
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v6, v7, s4
+; GFX9-NEXT:    global_store_dword v[4:5], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec1 = load <8 x i8>, ptr addrspace(1) %in0, align 4
+  %vec2 = load <8 x i8>, ptr addrspace(1) %in1, align 4
+  %v1e5 = extractelement <8 x i8> %vec1, i64 5
+  %zv1e5 = zext i8 %v1e5 to i32
+  %byte1 = shl i32 %zv1e5, 8
+
+  %v1e7 = extractelement <8 x i8> %vec1, i64 7
+  %zv1e7 = zext i8 %v1e7 to i32
+  %byte2 = shl i32 %zv1e7, 16
+  %v2e6 = extractelement <8 x i8> %vec2, i64 6
+  %zv2e6 = zext i8 %v2e6 to i32
+  %byte3 = shl i32 %zv2e6, 24
+
+  %tmp0 = or i32 %zv1e5, %byte1
+  %tmp1 = or i32 %tmp0, %byte2
+  %res = or i32 %tmp1, %byte3
+  store i32 %res, ptr addrspace(1) %out0, align 4
+  ret void
+}
+
+define hidden void @extract_v8i8(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
+; GFX10-LABEL: extract_v8i8:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x1070404
+; GFX10-NEXT:    global_store_dword v[2:3], v0, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: extract_v8i8:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x1070404
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec1 = load <8 x i8>, ptr addrspace(1) %in0, align 4
+  %v1e4 = extractelement <8 x i8> %vec1, i64 4
+  %zv1e4 = zext i8 %v1e4 to i32
+  %byte1 = shl i32 %zv1e4, 8
+
+  %v1e7 = extractelement <8 x i8> %vec1, i64 7
+  %zv1e7 = zext i8 %v1e7 to i32
+  %byte2 = shl i32 %zv1e7, 16
+  %v2e1 = extractelement <8 x i8> %vec1, i64 1
+  %zv2e1 = zext i8 %v2e1 to i32
+  %byte3 = shl i32 %zv2e1, 24
+
+  %tmp0 = or i32 %zv1e4, %byte1
+  %tmp1 = or i32 %tmp0, %byte2
+  %res = or i32 %tmp1, %byte3
+  store i32 %res, ptr addrspace(1) %out0, align 4
+  ret void
+}
+
+define hidden void @extract_v256i8(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
+; GFX10-LABEL: extract_v256i8:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:252
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x6050707
+; GFX10-NEXT:    global_store_dword v[2:3], v0, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: extract_v256i8:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:252
+; GFX9-NEXT:    s_mov_b32 s4, 0x6050707
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec1 = load <256 x i8>, ptr addrspace(1) %in0, align 4
+  %v1e4 = extractelement <256 x i8> %vec1, i64 255
+  %zv1e4 = zext i8 %v1e4 to i32
+  %byte1 = shl i32 %zv1e4, 8
+
+  %v1e7 = extractelement <256 x i8> %vec1, i64 253
+  %zv1e7 = zext i8 %v1e7 to i32
+  %byte2 = shl i32 %zv1e7, 16
+  %v2e1 = extractelement <256 x i8> %vec1, i64 254
+  %zv2e1 = zext i8 %v2e1 to i32
+  %byte3 = shl i32 %zv2e1, 24
+
+  %tmp0 = or i32 %zv1e4, %byte1
+  %tmp1 = or i32 %tmp0, %byte2
+  %res = or i32 %tmp1, %byte3
+  store i32 %res, ptr addrspace(1) %out0, align 4
+  ret void
+}
+
+; TODO: support this pattern
+define hidden void @extract_3src(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
+; GFX10-LABEL: extract_3src:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
+; GFX10-NEXT:    global_load_dword v8, v[2:3], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v7
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 8, v8
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xff, v6
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xff0000, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xff000000, v1
+; GFX10-NEXT:    v_lshl_or_b32 v2, v2, 8, v2
+; GFX10-NEXT:    v_or3_b32 v0, v2, v0, v1
+; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: extract_3src:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
+; GFX9-NEXT:    global_load_dword v8, v[2:3], off offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v7
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xff000000, v2
+; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 8, v0
+; GFX9-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX9-NEXT:    global_store_dword v[4:5], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec1 = load <8 x i8>, ptr addrspace(1) %in0, align 4
+  %vec2 = load <8 x i8>, ptr addrspace(1) %in1, align 4
+  %v1e0 = extractelement <8 x i8> %vec1, i64 0
+  %zv1e0 = zext i8 %v1e0 to i32
+  %byte1 = shl i32 %zv1e0, 8
+
+  %v1e5 = extractelement <8 x i8> %vec1, i64 5
+  %zv1e5 = zext i8 %v1e5 to i32
+  %byte2 = shl i32 %zv1e5, 16
+  %v2e6 = extractelement <8 x i8> %vec2, i64 6
+  %zv2e6 = zext i8 %v2e6 to i32
+  %byte3 = shl i32 %zv2e6, 24
+
+  %tmp0 = or i32 %zv1e0, %byte1
+  %tmp1 = or i32 %tmp0, %byte2
+  %res = or i32 %tmp1, %byte3
+  store i32 %res, ptr addrspace(1) %out0, align 4
+  ret void
+}

>From 0cba11c927e0e8b5d4ea906a3ef5dc03f1fb1e3d Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 6 Sep 2023 10:18:05 -0700
Subject: [PATCH 3/4] [AMDGPU]: Accept constant zero bytes in v_perm OrCombine

Change-Id: I53bf4080ce90f76f97b2f1de5c16987cb0512eaa
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp  |  61 +++++-
 llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll  | 242 ++++++++++-----------
 llvm/test/CodeGen/AMDGPU/ds_read2.ll       |  18 +-
 llvm/test/CodeGen/AMDGPU/load-hi16.ll      |  24 +-
 llvm/test/CodeGen/AMDGPU/load-lo16.ll      |  48 ++--
 llvm/test/CodeGen/AMDGPU/load-local.128.ll |  49 +++--
 llvm/test/CodeGen/AMDGPU/load-local.96.ll  |  37 ++--
 llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll     |  18 +-
 llvm/test/CodeGen/AMDGPU/permute_i8.ll     |  26 +--
 llvm/test/CodeGen/AMDGPU/shl.v2i16.ll      |  18 +-
 10 files changed, 290 insertions(+), 251 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a6386c116fa7808..403d61f1b836fab 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10706,6 +10706,23 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
                             Depth + 1);
   }
 
+  case ISD::EXTRACT_VECTOR_ELT: {
+    auto IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+    if (!IdxOp)
+      return std::nullopt;
+    auto VecIdx = IdxOp->getZExtValue();
+    auto ScalarSize = Op.getScalarValueSizeInBits();
+    if (ScalarSize != 32) {
+      if ((VecIdx + 1) * ScalarSize > 32)
+        return std::nullopt;
+      SrcIndex = ScalarSize == 8 ? VecIdx : VecIdx * 2 + SrcIndex;
+      return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
+    }
+
+    // The scalar is 32 bits, so just use the scalar
+    return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
+  }
+
   default: {
     if (auto A = dyn_cast<AtomicSDNode>(Op) || Op->isMemIntrinsic())
       return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
@@ -10983,6 +11000,9 @@ static bool addresses16Bits(int Mask) {
   int Low8 = Mask & 0xff;
   int Hi8 = (Mask & 0xff00) >> 8;
 
+  if (Low8 == 0x0c || Hi8 == 0x0c)
+    return false;
+
   assert(Low8 < 8 && Hi8 < 8);
   // Are the bytes contiguous in the order of increasing addresses.
   bool IsConsecutive = (Hi8 - Low8 == 1);
@@ -11099,12 +11119,33 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
     // If all the uses of an or need to extract the individual elements, do not
     // attempt to lower into v_perm
     auto usesCombinedOperand = [](SDNode *OrUse) {
+      // The combined bytes seem to be getting extracted
+      if (OrUse->getOpcode() == ISD::SRL || OrUse->getOpcode() == ISD::TRUNCATE)
+        return false;
+
+      if (OrUse->getOpcode() == ISD::AND) {
+        auto SelectMask = dyn_cast<ConstantSDNode>(OrUse->getOperand(1));
+        if (SelectMask && (SelectMask->getZExtValue() == 0xFF))
+          return false;
+      }
+
+      if (OrUse->getOpcode() == AMDGPUISD::CVT_F32_UBYTE0 ||
+          OrUse->getOpcode() == AMDGPUISD::CVT_F32_UBYTE1 ||
+          OrUse->getOpcode() == AMDGPUISD::CVT_F32_UBYTE2 ||
+          OrUse->getOpcode() == AMDGPUISD::CVT_F32_UBYTE3) {
+        return false;
+      }
+
+      if (auto StoreUse = dyn_cast<StoreSDNode>(OrUse))
+        if (StoreUse->isTruncatingStore() &&
+            StoreUse->getMemoryVT().getSizeInBits() == 8)
+          return false;
+
       // If we have any non-vectorized use, then it is a candidate for v_perm
-      if (OrUse->getOpcode() != ISD::BITCAST ||
-          !OrUse->getValueType(0).isVector())
+      if (!(OrUse->getValueType(0).isVector() &&
+            OrUse->getOpcode() != ISD::BUILD_VECTOR))
         return true;
 
-      // If we have any non-vectorized use, then it is a candidate for v_perm
       for (auto VUse : OrUse->uses()) {
         if (!VUse->getValueType(0).isVector())
           return true;
@@ -11166,8 +11207,7 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
         // Find the ByteProvider that provides the ith byte of the result of OR
         std::optional<ByteProvider<SDValue>> P =
             calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
-        // TODO support constantZero
-        if (!P || P->isConstantZero())
+        if (!P)
           return SDValue();
 
         PermNodes.push_back(*P);
@@ -11180,6 +11220,14 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
       uint64_t PermMask = 0x00000000;
       for (size_t i = 0; i < PermNodes.size(); i++) {
         auto PermOp = PermNodes[i];
+        if (PermOp.isConstantZero()) {
+          if (FirstSrc.first == i) {
+            FirstSrc.first = i + 1;
+            FirstSrc.second = PermNodes[i].SrcOffset / 4;
+          }
+          PermMask |= 0x0c << (i * 8);
+          continue;
+        }
         // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
         // by sizeof(Src2) = 4
         int SrcByteAdjust = 4;
@@ -11203,6 +11251,8 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
         PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
       }
       SDLoc DL(N);
+      if (PermMask == 0x0c0c0c0c)
+        return DAG.getConstant(0, DL, MVT::i32);
       SDValue Op = *PermNodes[FirstSrc.first].Src;
       Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
       assert(Op.getValueSizeInBits() == 32);
@@ -11229,7 +11279,6 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
       assert(Op.getValueSizeInBits() == 32);
 
       if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
-
         assert(Op.getValueType().isByteSized() &&
                OtherOp.getValueType().isByteSized());
 
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 901cbd4a5272059..a02d11533a988f1 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -1428,7 +1428,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(<4 x float> add
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT:    s_mov_b32 s8, 0x4000405
+; VI-NEXT:    s_mov_b32 s8, 0xc0c0004
+; VI-NEXT:    s_mov_b32 s9, 0x4000405
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v0
@@ -1438,35 +1439,31 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(<4 x float> add
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 2, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    flat_load_ubyte v6, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, 2, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v4
+; VI-NEXT:    flat_load_ubyte v7, v[2:3]
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v4
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 2, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
-; VI-NEXT:    v_add_u32_e32 v4, vcc, 2, v4
-; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; VI-NEXT:    flat_load_ubyte v2, v[2:3]
-; VI-NEXT:    flat_load_ubyte v3, v[4:5]
-; VI-NEXT:    flat_load_ubyte v4, v[0:1]
+; VI-NEXT:    flat_load_ubyte v0, v[0:1]
+; VI-NEXT:    flat_load_ubyte v1, v[2:3]
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_mov_b32 s4, s2
 ; VI-NEXT:    s_mov_b32 s5, s3
 ; VI-NEXT:    s_mov_b32 s2, s6
 ; VI-NEXT:    s_mov_b32 s3, s7
-; VI-NEXT:    s_waitcnt vmcnt(3)
-; VI-NEXT:    v_lshlrev_b32_e32 v5, 8, v6
-; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v6
 ; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_lshlrev_b32_e32 v7, 8, v2
-; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v3
+; VI-NEXT:    v_perm_b32 v3, v7, v6, s8
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v4
-; VI-NEXT:    v_or_b32_e32 v4, v5, v4
-; VI-NEXT:    v_or_b32_e32 v5, v7, v3
+; VI-NEXT:    v_perm_b32 v0, v1, v0, s8
+; VI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v3
+; VI-NEXT:    v_perm_b32 v4, v3, v0, s9
+; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
+; VI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v3
 ; VI-NEXT:    v_mov_b32_e32 v3, v1
-; VI-NEXT:    v_perm_b32 v4, v4, v5, s8
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; VI-NEXT:    buffer_store_dword v4, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
@@ -1475,24 +1472,24 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(<4 x float> add
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    v_mov_b32_e32 v7, 0
+; GFX10-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x3
 ; GFX10-NEXT:    global_load_ubyte v1, v0, s[4:5] offset:2
-; GFX10-NEXT:    global_load_ubyte v3, v0, s[4:5] offset:3
-; GFX10-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:3
+; GFX10-NEXT:    global_load_ubyte v2, v0, s[4:5] offset:3
+; GFX10-NEXT:    global_load_ubyte v3, v0, s[6:7] offset:3
 ; GFX10-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:2
 ; GFX10-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-NEXT:    v_lshl_or_b32 v5, v3, 8, v1
-; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
+; GFX10-NEXT:    v_perm_b32 v5, v1, v2, 0xc0c0004
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshl_or_b32 v6, v2, 8, v4
+; GFX10-NEXT:    v_perm_b32 v4, v4, v3, 0xc0c0004
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, v5
+; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v0, v5
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v2, v4
-; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v3
+; GFX10-NEXT:    v_perm_b32 v4, v5, v4, 0x4000405
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v1
-; GFX10-NEXT:    v_perm_b32 v4, v5, v6, 0x4000405
-; GFX10-NEXT:    global_store_dwordx4 v7, v[0:3], s[0:1]
-; GFX10-NEXT:    global_store_dword v7, v4, s[2:3]
+; GFX10-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
+; GFX10-NEXT:    global_store_dword v6, v4, s[2:3]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v4i8_to_v4f32_unaligned_multiuse:
@@ -1505,16 +1502,17 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(<4 x float> add
 ; GFX9-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:3
 ; GFX9-NEXT:    global_load_ubyte v3, v0, s[4:5] offset:3
 ; GFX9-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:2
-; GFX9-NEXT:    s_mov_b32 s4, 0x4000405
+; GFX9-NEXT:    s_mov_b32 s4, 0xc0c0004
+; GFX9-NEXT:    s_mov_b32 s5, 0x4000405
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshl_or_b32 v6, v3, 8, v1
-; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
+; GFX9-NEXT:    v_perm_b32 v0, v1, v3, s4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshl_or_b32 v7, v2, 8, v4
-; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, v4
-; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v3
+; GFX9-NEXT:    v_perm_b32 v1, v4, v2, s4
+; GFX9-NEXT:    v_perm_b32 v4, v0, v1, s5
+; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, v1
+; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v1, v0
+; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v1
-; GFX9-NEXT:    v_perm_b32 v4, v6, v7, s4
 ; GFX9-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1]
 ; GFX9-NEXT:    global_store_dword v5, v4, s[2:3]
 ; GFX9-NEXT:    s_endpgm
@@ -1527,19 +1525,20 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(<4 x float> add
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x3
 ; GFX11-NEXT:    global_load_u8 v1, v0, s[4:5] offset:2
-; GFX11-NEXT:    global_load_u8 v3, v0, s[4:5] offset:3
-; GFX11-NEXT:    global_load_u8 v2, v0, s[6:7] offset:3
+; GFX11-NEXT:    global_load_u8 v2, v0, s[4:5] offset:3
+; GFX11-NEXT:    global_load_u8 v3, v0, s[6:7] offset:3
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[6:7] offset:2
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_lshl_or_b32 v4, v3, 8, v1
-; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
+; GFX11-NEXT:    v_perm_b32 v4, v1, v2, 0xc0c0004
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshl_or_b32 v5, v2, 8, v0
-; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
-; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v3
-; GFX11-NEXT:    v_mov_b32_e32 v3, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v5, v0, v3, 0xc0c0004
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v1, v4
+; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v0, v4
+; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v2, v5
 ; GFX11-NEXT:    v_perm_b32 v4, v4, v5, 0x4000405
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b128 v6, v[0:3], s[0:1]
 ; GFX11-NEXT:    global_store_b32 v6, v4, s[2:3]
@@ -1794,43 +1793,46 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; VI-NEXT:    s_mov_b32 s4, 0xc0c0004
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 5, v0
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_ubyte v10, v[2:3]
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 6, v0
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v4, vcc, 1, v0
+; VI-NEXT:    v_add_u32_e32 v4, vcc, 3, v0
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v6, vcc, 2, v0
 ; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v8, vcc, 3, v0
-; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_ubyte v8, v[0:1]
+; VI-NEXT:    flat_load_ubyte v9, v[2:3]
+; VI-NEXT:    flat_load_ubyte v10, v[4:5]
 ; VI-NEXT:    flat_load_ubyte v6, v[6:7]
-; VI-NEXT:    flat_load_ubyte v7, v[8:9]
-; VI-NEXT:    flat_load_ubyte v8, v[2:3]
-; VI-NEXT:    flat_load_ubyte v2, v[0:1]
-; VI-NEXT:    flat_load_ubyte v4, v[4:5]
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 5, v0
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-NEXT:    v_add_u32_e32 v4, vcc, 6, v0
+; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 4, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_ubyte v9, v[0:1]
+; VI-NEXT:    flat_load_ubyte v2, v[2:3]
+; VI-NEXT:    flat_load_ubyte v3, v[4:5]
+; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
-; VI-NEXT:    s_waitcnt vmcnt(6)
-; VI-NEXT:    v_cvt_f32_ubyte0_e32 v5, v10
-; VI-NEXT:    s_waitcnt vmcnt(4)
-; VI-NEXT:    v_cvt_f32_ubyte0_e32 v3, v7
+; VI-NEXT:    s_waitcnt vmcnt(5)
+; VI-NEXT:    v_perm_b32 v7, v8, v9, s4
+; VI-NEXT:    s_waitcnt vmcnt(3)
+; VI-NEXT:    v_perm_b32 v1, v6, v10, s4
 ; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v2
-; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v6
+; VI-NEXT:    v_cvt_f32_ubyte0_e32 v5, v2
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v4
-; VI-NEXT:    v_cvt_f32_ubyte0_e32 v6, v8
+; VI-NEXT:    v_cvt_f32_ubyte0_e32 v6, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v9
+; VI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v0
+; VI-NEXT:    v_cvt_f32_ubyte1_e32 v3, v1
+; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v1
+; VI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v7
+; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v7
 ; VI-NEXT:    buffer_store_dwordx3 v[4:6], off, s[0:3], 0 offset:16
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
@@ -1839,90 +1841,86 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-NEXT:    v_mov_b32_e32 v8, 0
+; GFX10-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x5
-; GFX10-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:6
-; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:3
-; GFX10-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:2
+; GFX10-NEXT:    global_load_short_d16 v1, v0, s[2:3] offset:4
+; GFX10-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:6
+; GFX10-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
+; GFX10-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:2
 ; GFX10-NEXT:    global_load_ubyte v5, v0, s[2:3] offset:1
-; GFX10-NEXT:    global_load_short_d16 v7, v0, s[2:3] offset:4
-; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
-; GFX10-NEXT:    s_waitcnt vmcnt(5)
-; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v6, v4
-; GFX10-NEXT:    s_waitcnt vmcnt(4)
-; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
-; GFX10-NEXT:    s_waitcnt vmcnt(3)
-; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
+; GFX10-NEXT:    global_load_ubyte v6, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, v5
-; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v5, v7
-; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v4, v7
+; GFX10-NEXT:    v_perm_b32 v0, v4, v3, 0xc0c0004
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v4, v1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dwordx3 v8, v[4:6], s[0:1] offset:16
-; GFX10-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX10-NEXT:    v_perm_b32 v8, v6, v5, 0xc0c0004
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v6, v2
+; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v5, v1
+; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v3, v0
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
+; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v8
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v8
+; GFX10-NEXT:    global_store_dwordx3 v7, v[4:6], s[0:1] offset:16
+; GFX10-NEXT:    global_store_dwordx4 v7, v[0:3], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v7i8_to_v7f32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX9-NEXT:    v_mov_b32_e32 v10, 0
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:6
-; GFX9-NEXT:    global_load_ushort v2, v0, s[2:3] offset:4
-; GFX9-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
-; GFX9-NEXT:    global_load_ubyte v7, v0, s[2:3] offset:2
-; GFX9-NEXT:    global_load_ubyte v8, v0, s[2:3] offset:1
-; GFX9-NEXT:    global_load_ubyte v9, v0, s[2:3]
-; GFX9-NEXT:    s_waitcnt vmcnt(5)
-; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v6, v1
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v5, v2
-; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v4, v2
-; GFX9-NEXT:    s_waitcnt vmcnt(3)
-; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, v3
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, v7
+; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] offset:4
+; GFX9-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:6
+; GFX9-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:2
+; GFX9-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:1
+; GFX9-NEXT:    global_load_ubyte v5, v0, s[2:3]
+; GFX9-NEXT:    global_load_ubyte v6, v0, s[2:3] offset:3
+; GFX9-NEXT:    s_mov_b32 s2, 0xc0c0004
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v1, v8
+; GFX9-NEXT:    v_perm_b32 v0, v5, v4, s2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v9
-; GFX9-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1]
-; GFX9-NEXT:    global_store_dwordx3 v10, v[4:6], s[0:1] offset:16
+; GFX9-NEXT:    v_perm_b32 v8, v3, v6, s2
+; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v6, v2
+; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v5, v1
+; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v4, v1
+; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v3, v8
+; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, v8
+; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
+; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX9-NEXT:    global_store_dwordx3 v7, v[4:6], s[0:1] offset:16
+; GFX9-NEXT:    global_store_dwordx4 v7, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v7i8_to_v7f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT:    v_mov_b32_e32 v8, 0
+; GFX11-NEXT:    v_dual_mov_b32 v7, 0 :: v_dual_lshlrev_b32 v0, 3, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x5
-; GFX11-NEXT:    global_load_u8 v4, v0, s[2:3] offset:6
-; GFX11-NEXT:    global_load_u8 v1, v0, s[2:3] offset:3
-; GFX11-NEXT:    global_load_u8 v2, v0, s[2:3] offset:2
+; GFX11-NEXT:    global_load_d16_b16 v1, v0, s[2:3] offset:4
+; GFX11-NEXT:    global_load_u8 v2, v0, s[2:3] offset:6
+; GFX11-NEXT:    global_load_u8 v3, v0, s[2:3] offset:3
+; GFX11-NEXT:    global_load_u8 v4, v0, s[2:3] offset:2
 ; GFX11-NEXT:    global_load_u8 v5, v0, s[2:3] offset:1
-; GFX11-NEXT:    global_load_d16_b16 v7, v0, s[2:3] offset:4
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v6, v4
 ; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
-; GFX11-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
+; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v6, v2
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v1, v5
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v5, v7
-; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v4, v7
+; GFX11-NEXT:    v_perm_b32 v8, v4, v3, 0xc0c0004
+; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v4, v1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v5, 0xc0c0004
+; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v5, v1
+; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v3, v8
+; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v2, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b96 v8, v[4:6], s[0:1] offset:16
-; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
+; GFX11-NEXT:    global_store_b96 v7, v[4:6], s[0:1] offset:16
+; GFX11-NEXT:    global_store_b128 v7, v[0:3], s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
index 9ec9414d91171b7..4056f2d17b2bee9 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -564,6 +564,7 @@ define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrsp
 ; GFX9-ALIGNED-NEXT:    s_load_dword s4, s[0:1], 0x8
 ; GFX9-ALIGNED-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX9-ALIGNED-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-ALIGNED-NEXT:    s_mov_b32 s0, 0xc0c0004
 ; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-ALIGNED-NEXT:    v_add_u32_e32 v1, s4, v0
 ; GFX9-ALIGNED-NEXT:    ds_read_u8 v2, v1
@@ -575,14 +576,14 @@ define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrsp
 ; GFX9-ALIGNED-NEXT:    ds_read_u8 v8, v1 offset:34
 ; GFX9-ALIGNED-NEXT:    ds_read_u8 v1, v1 offset:35
 ; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(6)
-; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v2, v3, 8, v2
+; GFX9-ALIGNED-NEXT:    v_perm_b32 v2, v2, v3, s0
 ; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(4)
-; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v3, v5, 8, v4
+; GFX9-ALIGNED-NEXT:    v_perm_b32 v3, v4, v5, s0
 ; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
 ; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v3, v7, 8, v6
+; GFX9-ALIGNED-NEXT:    v_perm_b32 v3, v6, v7, s0
 ; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v1, v1, 8, v8
+; GFX9-ALIGNED-NEXT:    v_perm_b32 v1, v8, v1, s0
 ; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
 ; GFX9-ALIGNED-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX9-ALIGNED-NEXT:    global_store_dword v0, v1, s[2:3]
@@ -657,6 +658,7 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr
 ; GFX9-ALIGNED-NEXT:    s_load_dword s4, s[0:1], 0x8
 ; GFX9-ALIGNED-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX9-ALIGNED-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-ALIGNED-NEXT:    s_mov_b32 s0, 0xc0c0004
 ; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-ALIGNED-NEXT:    v_add_u32_e32 v1, s4, v0
 ; GFX9-ALIGNED-NEXT:    ds_read_u8 v2, v1 offset:5
@@ -668,14 +670,14 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr
 ; GFX9-ALIGNED-NEXT:    ds_read_u8 v8, v1 offset:11
 ; GFX9-ALIGNED-NEXT:    ds_read_u8 v1, v1 offset:12
 ; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(6)
-; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v2, v3, 8, v2
+; GFX9-ALIGNED-NEXT:    v_perm_b32 v2, v2, v3, s0
 ; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(4)
-; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v3, v5, 8, v4
+; GFX9-ALIGNED-NEXT:    v_perm_b32 v3, v4, v5, s0
 ; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
 ; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v3, v7, 8, v6
+; GFX9-ALIGNED-NEXT:    v_perm_b32 v3, v6, v7, s0
 ; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v1, v1, 8, v8
+; GFX9-ALIGNED-NEXT:    v_perm_b32 v1, v8, v1, s0
 ; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
 ; GFX9-ALIGNED-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX9-ALIGNED-NEXT:    global_store_dword v0, v1, s[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll
index 26a1716db20271a..2497860bd1b4423 100644
--- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll
@@ -491,9 +491,9 @@ define void @load_local_hi_v2i16_reglo_vreg_zexti8(ptr addrspace(3) %in, i16 %re
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_u8 v0, v0
+; GFX803-NEXT:    s_mov_b32 s4, 0xc000504
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
@@ -798,9 +798,9 @@ define void @load_global_hi_v2i16_reglo_vreg_zexti8(ptr addrspace(1) %in, i16 %r
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
 ; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX803-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX803-NEXT:    s_mov_b32 s4, 0xc000504
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
-; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX803-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    v_perm_b32 v0, v2, v0, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
@@ -1102,9 +1102,9 @@ define void @load_flat_hi_v2i16_reglo_vreg_zexti8(ptr %in, i16 %reg) #0 {
 ; GFX803:       ; %bb.0: ; %entry
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX803-NEXT:    s_mov_b32 s4, 0xc000504
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX803-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    v_perm_b32 v0, v2, v0, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
@@ -1496,9 +1496,9 @@ define void @load_private_hi_v2i16_reglo_vreg_zexti8(ptr addrspace(5) byval(i8)
 ; GFX803:       ; %bb.0: ; %entry
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32 offset:4095
+; GFX803-NEXT:    s_mov_b32 s4, 0xc000504
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
-; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
@@ -1699,8 +1699,8 @@ define void @load_private_hi_v2i16_reglo_vreg_nooff_zexti8(ptr addrspace(5) %in,
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
-; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    s_mov_b32 s4, 0xc000504
+; GFX803-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
@@ -2196,9 +2196,9 @@ define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg, ptr add
 ; GFX803-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32 offset:4059
+; GFX803-NEXT:    s_mov_b32 s4, 0xc000504
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
-; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll
index 5843ac77baa9614..d57131d944b620d 100644
--- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll
@@ -314,9 +314,9 @@ define void @load_local_lo_v2i16_reghi_vreg_zexti8(ptr addrspace(3) %in, i32 %re
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_u8 v0, v0
-; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX803-NEXT:    s_mov_b32 s4, 0x3020c04
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
@@ -357,9 +357,9 @@ define void @load_local_lo_v2i16_reglo_vreg_zexti8(ptr addrspace(3) %in, i16 %re
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_u8 v0, v0
-; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT:    s_mov_b32 s4, 0x1000c04
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
@@ -878,9 +878,9 @@ define void @load_global_lo_v2i16_reglo_vreg_zexti8(ptr addrspace(1) %in, i32 %r
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
 ; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX803-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX803-NEXT:    s_mov_b32 s4, 0x3020c04
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
-; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
@@ -964,9 +964,9 @@ define void @load_global_lo_v2f16_reglo_vreg_zexti8(ptr addrspace(1) %in, i32 %r
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
 ; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX803-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX803-NEXT:    s_mov_b32 s4, 0x3020c04
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
-; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
@@ -1130,9 +1130,9 @@ define void @load_flat_lo_v2i16_reglo_vreg_zexti8(ptr %in, i32 %reg) #0 {
 ; GFX803:       ; %bb.0: ; %entry
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX803-NEXT:    s_mov_b32 s4, 0x3020c04
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
@@ -1210,9 +1210,9 @@ define void @load_flat_lo_v2f16_reglo_vreg_zexti8(ptr %in, i32 %reg) #0 {
 ; GFX803:       ; %bb.0: ; %entry
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX803-NEXT:    s_mov_b32 s4, 0x3020c04
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
@@ -1590,9 +1590,9 @@ define void @load_private_lo_v2i16_reglo_vreg_zexti8(ptr addrspace(5) byval(i8)
 ; GFX803:       ; %bb.0: ; %entry
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32 offset:4095
-; GFX803-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX803-NEXT:    s_mov_b32 s4, 0x3020c04
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
-; GFX803-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX803-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
@@ -1691,8 +1691,8 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(ptr addrspace(5) %in,
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
-; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT:    s_mov_b32 s4, 0x3020c04
+; GFX803-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
@@ -1791,8 +1791,8 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(ptr addrspace(5) %in,
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
-; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT:    s_mov_b32 s4, 0x3020c04
+; GFX803-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
@@ -1927,9 +1927,9 @@ define void @load_constant_lo_v2f16_reglo_vreg_zexti8(ptr addrspace(4) %in, i32
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
 ; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX803-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX803-NEXT:    s_mov_b32 s4, 0x3020c04
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
-; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
@@ -2163,8 +2163,8 @@ define void @load_private_lo_v2i16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
 ; GFX803-NEXT:    v_mov_b32_e32 v2, 44
 ; GFX803-NEXT:    buffer_load_ubyte v1, v2, s[0:3], s32 offen offset:4055 glc
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
-; GFX803-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX803-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX803-NEXT:    s_mov_b32 s4, 0x3020c04
+; GFX803-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
@@ -2302,8 +2302,8 @@ define void @load_private_lo_v2f16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
 ; GFX803-NEXT:    v_mov_b32_e32 v2, 44
 ; GFX803-NEXT:    buffer_load_ubyte v1, v2, s[0:3], s32 offen offset:4055 glc
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
-; GFX803-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX803-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX803-NEXT:    s_mov_b32 s4, 0x3020c04
+; GFX803-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/load-local.128.ll
index 10dca76cc389aef..20eec9923aad110 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local.128.ll
@@ -69,25 +69,26 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
 ; GFX9-NEXT:    ds_read_u8 v14, v0 offset:13
 ; GFX9-NEXT:    ds_read_u8 v15, v0 offset:14
 ; GFX9-NEXT:    ds_read_u8 v16, v0 offset:15
+; GFX9-NEXT:    s_mov_b32 s4, 0xc0c0004
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(14)
-; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 8, v1
+; GFX9-NEXT:    v_perm_b32 v0, v1, v2, s4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(12)
-; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 8, v3
+; GFX9-NEXT:    v_perm_b32 v1, v3, v4, s4
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(10)
-; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 8, v5
+; GFX9-NEXT:    v_perm_b32 v1, v5, v6, s4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(8)
-; GFX9-NEXT:    v_lshl_or_b32 v2, v8, 8, v7
+; GFX9-NEXT:    v_perm_b32 v2, v7, v8, s4
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(6)
-; GFX9-NEXT:    v_lshl_or_b32 v2, v10, 8, v9
+; GFX9-NEXT:    v_perm_b32 v2, v9, v10, s4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
-; GFX9-NEXT:    v_lshl_or_b32 v3, v12, 8, v11
+; GFX9-NEXT:    v_perm_b32 v3, v11, v12, s4
 ; GFX9-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX9-NEXT:    v_lshl_or_b32 v3, v14, 8, v13
+; GFX9-NEXT:    v_perm_b32 v3, v13, v14, s4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_lshl_or_b32 v4, v16, 8, v15
+; GFX9-NEXT:    v_perm_b32 v4, v15, v16, s4
 ; GFX9-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -239,21 +240,21 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
 ; GFX10-NEXT:    ds_read_u8 v15, v0 offset:14
 ; GFX10-NEXT:    ds_read_u8 v0, v0 offset:15
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(14)
-; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
+; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0xc0c0004
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(12)
-; GFX10-NEXT:    v_lshl_or_b32 v2, v4, 8, v3
+; GFX10-NEXT:    v_perm_b32 v2, v3, v4, 0xc0c0004
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(10)
-; GFX10-NEXT:    v_lshl_or_b32 v3, v6, 8, v5
+; GFX10-NEXT:    v_perm_b32 v3, v5, v6, 0xc0c0004
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(8)
-; GFX10-NEXT:    v_lshl_or_b32 v4, v8, 8, v7
+; GFX10-NEXT:    v_perm_b32 v4, v7, v8, 0xc0c0004
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(6)
-; GFX10-NEXT:    v_lshl_or_b32 v5, v10, 8, v9
+; GFX10-NEXT:    v_perm_b32 v5, v9, v10, 0xc0c0004
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
-; GFX10-NEXT:    v_lshl_or_b32 v6, v12, 8, v11
+; GFX10-NEXT:    v_perm_b32 v6, v11, v12, 0xc0c0004
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX10-NEXT:    v_lshl_or_b32 v7, v14, 8, v13
+; GFX10-NEXT:    v_perm_b32 v7, v13, v14, 0xc0c0004
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_lshl_or_b32 v8, v0, 8, v15
+; GFX10-NEXT:    v_perm_b32 v8, v15, v0, 0xc0c0004
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
@@ -280,21 +281,21 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
 ; GFX11-NEXT:    ds_load_u8 v15, v0 offset:14
 ; GFX11-NEXT:    ds_load_u8 v0, v0 offset:15
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(14)
-; GFX11-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
+; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0xc0c0004
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(12)
-; GFX11-NEXT:    v_lshl_or_b32 v2, v4, 8, v3
+; GFX11-NEXT:    v_perm_b32 v2, v3, v4, 0xc0c0004
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(10)
-; GFX11-NEXT:    v_lshl_or_b32 v3, v6, 8, v5
+; GFX11-NEXT:    v_perm_b32 v3, v5, v6, 0xc0c0004
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(8)
-; GFX11-NEXT:    v_lshl_or_b32 v4, v8, 8, v7
+; GFX11-NEXT:    v_perm_b32 v4, v7, v8, 0xc0c0004
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(6)
-; GFX11-NEXT:    v_lshl_or_b32 v5, v10, 8, v9
+; GFX11-NEXT:    v_perm_b32 v5, v9, v10, 0xc0c0004
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(4)
-; GFX11-NEXT:    v_lshl_or_b32 v6, v12, 8, v11
+; GFX11-NEXT:    v_perm_b32 v6, v11, v12, 0xc0c0004
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX11-NEXT:    v_lshl_or_b32 v7, v14, 8, v13
+; GFX11-NEXT:    v_perm_b32 v7, v13, v14, 0xc0c0004
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_lshl_or_b32 v8, v0, 8, v15
+; GFX11-NEXT:    v_perm_b32 v8, v15, v0, 0xc0c0004
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
 ; GFX11-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
 ; GFX11-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
diff --git a/llvm/test/CodeGen/AMDGPU/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/load-local.96.ll
index 2da3fce72072ee7..b1eb3dd7c02c422 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local.96.ll
@@ -65,20 +65,21 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
 ; GFX9-NEXT:    ds_read_u8 v10, v0 offset:9
 ; GFX9-NEXT:    ds_read_u8 v11, v0 offset:10
 ; GFX9-NEXT:    ds_read_u8 v12, v0 offset:11
+; GFX9-NEXT:    s_mov_b32 s4, 0xc0c0004
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(10)
-; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 8, v1
+; GFX9-NEXT:    v_perm_b32 v0, v1, v2, s4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(8)
-; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 8, v3
+; GFX9-NEXT:    v_perm_b32 v1, v3, v4, s4
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(6)
-; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 8, v5
+; GFX9-NEXT:    v_perm_b32 v1, v5, v6, s4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
-; GFX9-NEXT:    v_lshl_or_b32 v2, v8, 8, v7
+; GFX9-NEXT:    v_perm_b32 v2, v7, v8, s4
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX9-NEXT:    v_lshl_or_b32 v2, v10, 8, v9
+; GFX9-NEXT:    v_perm_b32 v2, v9, v10, s4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_lshl_or_b32 v3, v12, 8, v11
+; GFX9-NEXT:    v_perm_b32 v3, v11, v12, s4
 ; GFX9-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -200,17 +201,17 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
 ; GFX10-NEXT:    ds_read_u8 v11, v0 offset:10
 ; GFX10-NEXT:    ds_read_u8 v0, v0 offset:11
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(10)
-; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
+; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0xc0c0004
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(8)
-; GFX10-NEXT:    v_lshl_or_b32 v2, v4, 8, v3
+; GFX10-NEXT:    v_perm_b32 v2, v3, v4, 0xc0c0004
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(6)
-; GFX10-NEXT:    v_lshl_or_b32 v3, v6, 8, v5
+; GFX10-NEXT:    v_perm_b32 v3, v5, v6, 0xc0c0004
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
-; GFX10-NEXT:    v_lshl_or_b32 v4, v8, 8, v7
+; GFX10-NEXT:    v_perm_b32 v4, v7, v8, 0xc0c0004
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX10-NEXT:    v_lshl_or_b32 v5, v10, 8, v9
+; GFX10-NEXT:    v_perm_b32 v5, v9, v10, 0xc0c0004
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_lshl_or_b32 v6, v0, 8, v11
+; GFX10-NEXT:    v_perm_b32 v6, v11, v0, 0xc0c0004
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
@@ -232,17 +233,17 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
 ; GFX11-NEXT:    ds_load_u8 v11, v0 offset:10
 ; GFX11-NEXT:    ds_load_u8 v0, v0 offset:11
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(10)
-; GFX11-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
+; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0xc0c0004
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(8)
-; GFX11-NEXT:    v_lshl_or_b32 v2, v4, 8, v3
+; GFX11-NEXT:    v_perm_b32 v2, v3, v4, 0xc0c0004
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(6)
-; GFX11-NEXT:    v_lshl_or_b32 v3, v6, 8, v5
+; GFX11-NEXT:    v_perm_b32 v3, v5, v6, 0xc0c0004
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(4)
-; GFX11-NEXT:    v_lshl_or_b32 v4, v8, 8, v7
+; GFX11-NEXT:    v_perm_b32 v4, v7, v8, 0xc0c0004
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX11-NEXT:    v_lshl_or_b32 v5, v10, 8, v9
+; GFX11-NEXT:    v_perm_b32 v5, v9, v10, 0xc0c0004
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_lshl_or_b32 v6, v0, 8, v11
+; GFX11-NEXT:    v_perm_b32 v6, v11, v0, 0xc0c0004
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
 ; GFX11-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
index 7389827b5090b23..177c3e9470c49f2 100644
--- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
@@ -456,13 +456,12 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT:    s_mov_b32 s0, 0xc070c05
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshrrev_b32_e32 v2, 24, v3
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; VI-NEXT:    v_perm_b32 v2, v3, v3, s0
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -639,16 +638,15 @@ define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; VI-NEXT:    s_mov_b32 s2, 0xc010c05
+; VI-NEXT:    s_mov_b32 s3, 0xc070c05
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
-; VI-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
-; VI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; VI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; VI-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; VI-NEXT:    v_perm_b32 v0, v0, v4, s2
+; VI-NEXT:    v_perm_b32 v1, v1, v1, s3
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
index 2d73a06276b18a3..e4f4af2a07f8e09 100644
--- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
@@ -3211,34 +3211,28 @@ define hidden void @extract_3src(ptr addrspace(1) %in0, ptr addrspace(1) %in1, p
 ; GFX10-LABEL: extract_3src:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
 ; GFX10-NEXT:    global_load_dword v8, v[2:3], off offset:4
+; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v8
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 8, v8
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xff, v6
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xff0000, v0
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xff000000, v1
-; GFX10-NEXT:    v_lshl_or_b32 v2, v2, 8, v2
-; GFX10-NEXT:    v_or3_b32 v0, v2, v0, v1
+; GFX10-NEXT:    v_perm_b32 v1, v6, v7, 0xc010404
+; GFX10-NEXT:    v_and_or_b32 v0, 0xff000000, v0, v1
 ; GFX10-NEXT:    global_store_dword v[4:5], v0, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: extract_3src:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
 ; GFX9-NEXT:    global_load_dword v8, v[2:3], off offset:4
+; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0xc010404
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v8
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff000000, v2
-; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 8, v0
-; GFX9-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX9-NEXT:    v_perm_b32 v1, v6, v7, s4
+; GFX9-NEXT:    s_mov_b32 s4, 0xff000000
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v1
 ; GFX9-NEXT:    global_store_dword v[4:5], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
index 3d6d4d49b82266c..e389fadc5cda996 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
@@ -468,14 +468,12 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT:    s_mov_b32 s0, 0x60c040c
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
-; VI-NEXT:    v_and_b32_e32 v2, 0xff000000, v2
-; VI-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; VI-NEXT:    v_or_b32_e32 v2, v3, v2
+; VI-NEXT:    v_perm_b32 v2, v3, v3, s0
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -652,18 +650,16 @@ define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; VI-NEXT:    s_mov_b32 s2, 0x20c000c
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshlrev_b32_e32 v4, 8, v1
-; VI-NEXT:    v_lshlrev_b16_e32 v5, 8, v0
+; VI-NEXT:    v_perm_b32 v1, v0, v1, s2
+; VI-NEXT:    v_lshlrev_b16_e32 v4, 8, v0
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT:    v_and_b32_e32 v4, 0xff000000, v4
 ; VI-NEXT:    v_and_b32_e32 v0, 0xff000000, v0
-; VI-NEXT:    v_or_b32_e32 v1, v1, v4
-; VI-NEXT:    v_or_b32_e32 v0, v5, v0
+; VI-NEXT:    v_or_b32_e32 v0, v4, v0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
 ;

>From 4ee0c89e26fe2a1654284598a975b18b8b1da5a1 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 14 Sep 2023 12:20:06 -0700
Subject: [PATCH 4/4] [AMDGPU] Add IR LiveReg type-based optimization

Change-Id: I816f6fdcba38e177a9f5bcf7d49b9da28eeed13b
---
 .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp    |  343 +++
 .../amdgpu-codegenprepare-break-large-phis.ll |  125 +-
 .../test/CodeGen/AMDGPU/vni8-across-blocks.ll | 2692 ++++++-----------
 3 files changed, 1407 insertions(+), 1753 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 4cce34bdeabcf44..b50379e98d0f6b5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -106,6 +106,7 @@ class AMDGPUCodeGenPrepareImpl
   Module *Mod = nullptr;
   const DataLayout *DL = nullptr;
   bool HasUnsafeFPMath = false;
+  bool UsesGlobalISel = false;
   bool HasFP32DenormalFlush = false;
   bool FlowChanged = false;
   mutable Function *SqrtF32 = nullptr;
@@ -341,6 +342,85 @@ class AMDGPUCodeGenPrepare : public FunctionPass {
   StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
 };
 
+class LiveRegConversion {
+private:
+  // The instruction which defined the original virtual register used across
+  // blocks
+  Instruction *LiveRegDef;
+  // The original type
+  Type *OriginalType;
+  // The desired type; only the constructor taking NewType initializes it
+  Type *NewType;
+  // The instruction sequence that converts the virtual register, to be used
+  // instead of the original
+  std::optional<Instruction *> Converted;
+  // The builder used to build the conversion instruction
+  IRBuilder<> ConvertBuilder;
+
+public:
+  // The instruction which defined the original virtual register used across
+  // blocks
+  Instruction *getLiveRegDef() { return LiveRegDef; }
+  // The original type
+  Type *getOriginalType() { return OriginalType; }
+  // The desired type
+  Type *getNewType() { return NewType; }
+  void setNewType(Type *NewType) { this->NewType = NewType; }
+  // The instruction that converts the virtual register, to be used instead of
+  // the original
+  std::optional<Instruction *> &getConverted() { return Converted; }
+  void setConverted(Instruction *Converted) { this->Converted = Converted; }
+  // The builder used to build the conversion instruction
+  IRBuilder<> &getConverBuilder() { return ConvertBuilder; }
+  // Do we have an instruction sequence which converts the original virtual
+  // register
+  bool hasConverted() { return Converted.has_value(); }
+
+  LiveRegConversion(Instruction *LiveRegDef, BasicBlock *InsertBlock,
+                    BasicBlock::iterator InsertPt)
+      : LiveRegDef(LiveRegDef), OriginalType(LiveRegDef->getType()),
+        ConvertBuilder(InsertBlock, InsertPt) {}
+  LiveRegConversion(Instruction *LiveRegDef, Type *NewType,
+                    BasicBlock *InsertBlock, BasicBlock::iterator InsertPt)
+      : LiveRegDef(LiveRegDef), OriginalType(LiveRegDef->getType()),
+        NewType(NewType), ConvertBuilder(InsertBlock, InsertPt) {}
+};
+
+class LiveRegOptimizer {
+private:
+  Module *Mod = nullptr;
+  // The scalar type to convert to
+  Type *ConvertToScalar;
+  // Holds the collected PHIs with their pending new (value, block) operands
+  SmallVector<std::pair<Instruction *,
+                        SmallVector<std::pair<Instruction *, BasicBlock *>, 4>>,
+              4>
+      PHIUpdater;
+
+public:
+  // Should the def of the instruction be converted if it is live across blocks
+  bool shouldReplaceUses(const Instruction &I);
+  // Convert the virtual register to the compatible vector of legal type
+  void convertToOptType(LiveRegConversion &LR);
+  // Convert the virtual register back to the original type, stripping away
+  // the MSBs in cases where there was an imperfect fit (e.g. v2i32 -> v7i8)
+  void convertFromOptType(LiveRegConversion &LR);
+  // Get a vector of desired scalar type that is compatible with the original
+  // vector. In cases where there is no bitsize equivalent using a legal vector
+  // type, we pad the MSBs (e.g. v7i8 -> v2i32)
+  Type *getCompatibleType(Instruction *InstToConvert);
+  // Find and replace uses of the virtual register in a different block with a
+  // newly produced virtual register of legal type
+  bool replaceUses(Instruction &I);
+  // Replace the collected PHIs with newly produced incoming values. Replacement
+  // is only done if we have a replacement for each original incoming value.
+  bool replacePHIs();
+
+  LiveRegOptimizer(Module *Mod) : Mod(Mod) {
+    ConvertToScalar = Type::getInt32Ty(Mod->getContext());
+  }
+};
+
 } // end anonymous namespace
 
 bool AMDGPUCodeGenPrepareImpl::run(Function &F) {
@@ -358,6 +438,7 @@ bool AMDGPUCodeGenPrepareImpl::run(Function &F) {
       Next = std::next(I);
 
       MadeChange |= visit(*I);
+      I->getType();
 
       if (Next != E) { // Control flow changed
         BasicBlock *NextInstBB = Next->getParent();
@@ -369,9 +450,269 @@ bool AMDGPUCodeGenPrepareImpl::run(Function &F) {
       }
     }
   }
+
+  // GlobalISel should directly use the values, and does not need to emit
+  // CopyToReg/CopyFromReg pairs across blocks
+  if (UsesGlobalISel)
+    return MadeChange;
+
+  // "Optimize" the virtual regs that cross basic block boundaries. In such
+  // cases, vectors of illegal types will be scalarized and widened, with each
+  // scalar living in its own physical register. The optimization converts the
+  // vectors to equivalent vectors of legal type (which are converted back
+  // before uses in subsequent blocks), to pack the bits into fewer physical
+  // registers (used in CopyToReg/CopyFromReg pairs).
+  LiveRegOptimizer LRO(Mod);
+  for (auto &BB : F) {
+    for (auto &I : BB) {
+      if (!LRO.shouldReplaceUses(I))
+        continue;
+      MadeChange |= LRO.replaceUses(I);
+    }
+  }
+
+  MadeChange |= LRO.replacePHIs();
+  return MadeChange;
+}
+
+bool LiveRegOptimizer::replaceUses(Instruction &I) {
+  bool MadeChange = false;
+
+  struct ConvertUseInfo {
+    Instruction *Converted;
+    SmallVector<Instruction *, 4> Users;
+  };
+  DenseMap<BasicBlock *, ConvertUseInfo> UseConvertTracker;
+
+  LiveRegConversion FromLRC(
+      &I, I.getParent(),
+      static_cast<BasicBlock::iterator>(std::next(I.getIterator())));
+  FromLRC.setNewType(getCompatibleType(FromLRC.getLiveRegDef()));
+  for (auto IUser = I.user_begin(); IUser != I.user_end(); IUser++) {
+
+    if (auto UserInst = dyn_cast<Instruction>(*IUser)) {
+      if (UserInst->getParent() != I.getParent()) {
+        LLVM_DEBUG(dbgs() << *UserInst << "\n\tUses "
+                          << *FromLRC.getOriginalType()
+                          << " from previous block. Needs conversion\n");
+        convertToOptType(FromLRC);
+        if (!FromLRC.hasConverted())
+          continue;
+        // If it is a PHI node, just create and collect the new operand. We can
+        // only replace the PHI node once we have converted all the operands
+        if (auto PhiInst = dyn_cast<PHINode>(UserInst)) {
+          for (unsigned Idx = 0; Idx < PhiInst->getNumIncomingValues(); Idx++) {
+            auto IncVal = PhiInst->getIncomingValue(Idx);
+            if (&I == dyn_cast<Instruction>(IncVal)) {
+              auto IncBlock = PhiInst->getIncomingBlock(Idx);
+              auto PHIOps = find_if(
+                  PHIUpdater,
+                  [&UserInst](
+                      std::pair<Instruction *,
+                                SmallVector<
+                                    std::pair<Instruction *, BasicBlock *>, 4>>
+                          &Entry) { return Entry.first == UserInst; });
+
+              if (PHIOps == PHIUpdater.end())
+                PHIUpdater.push_back(
+                    {UserInst, {{*FromLRC.getConverted(), IncBlock}}});
+              else
+                PHIOps->second.push_back({*FromLRC.getConverted(), IncBlock});
+
+              break;
+            }
+          }
+          continue;
+        }
+
+        // Do not create multiple conversion sequences if there are multiple
+        // uses in the same block
+        if (UseConvertTracker.contains(UserInst->getParent())) {
+          UseConvertTracker[UserInst->getParent()].Users.push_back(UserInst);
+          LLVM_DEBUG(dbgs() << "\tUser already has access to converted def\n");
+          continue;
+        }
+
+        LiveRegConversion ToLRC(*FromLRC.getConverted(), I.getType(),
+                                UserInst->getParent(),
+                                static_cast<BasicBlock::iterator>(
+                                    UserInst->getParent()->getFirstNonPHIIt()));
+        convertFromOptType(ToLRC);
+        assert(ToLRC.hasConverted());
+        UseConvertTracker[UserInst->getParent()] = {*ToLRC.getConverted(),
+                                                    {UserInst}};
+      }
+    }
+  }
+
+  // Replace the collected uses in a separate loop that is not dependent upon
+  // the state of the uses
+  for (auto &Entry : UseConvertTracker) {
+    for (auto &UserInst : Entry.second.Users) {
+      LLVM_DEBUG(dbgs() << *UserInst
+                        << "\n\tNow uses: " << *Entry.second.Converted << "\n");
+      UserInst->replaceUsesOfWith(&I, Entry.second.Converted);
+      MadeChange = true;
+    }
+  }
+  return MadeChange;
+}
+
+bool LiveRegOptimizer::replacePHIs() {
+  bool MadeChange = false;
+  for (auto Ele : PHIUpdater) {
+    auto ThePHINode = dyn_cast<PHINode>(Ele.first);
+    assert(ThePHINode);
+    auto NewPHINodeOps = Ele.second;
+    LLVM_DEBUG(dbgs() << "Attempting to replace: " << *ThePHINode << "\n");
+    // If we have converted all the required operands, then do the replacement
+    if (ThePHINode->getNumIncomingValues() == NewPHINodeOps.size()) {
+      IRBuilder<> Builder(Ele.first);
+      auto NPHI = Builder.CreatePHI(NewPHINodeOps[0].first->getType(),
+                                    NewPHINodeOps.size());
+      for (auto IncVals : NewPHINodeOps) {
+        NPHI->addIncoming(IncVals.first, IncVals.second);
+        LLVM_DEBUG(dbgs() << "  Using: " << *IncVals.first
+                          << "  For: " << IncVals.second->getName() << "\n");
+      }
+      LLVM_DEBUG(dbgs() << "Sucessfully replaced with " << *NPHI << "\n");
+      LiveRegConversion ToLRC(NPHI, ThePHINode->getType(),
+                              ThePHINode->getParent(),
+                              static_cast<BasicBlock::iterator>(
+                                  ThePHINode->getParent()->getFirstNonPHIIt()));
+      convertFromOptType(ToLRC);
+      assert(ToLRC.hasConverted());
+      Ele.first->replaceAllUsesWith(*ToLRC.getConverted());
+      // The old PHI is no longer used
+      ThePHINode->eraseFromParent();
+      MadeChange = true;
+    }
+  }
   return MadeChange;
 }
 
+Type *LiveRegOptimizer::getCompatibleType(Instruction *InstToConvert) {
+  auto OriginalType = InstToConvert->getType();
+  assert(OriginalType->getScalarSizeInBits() <=
+         ConvertToScalar->getScalarSizeInBits());
+  auto VTy = dyn_cast<VectorType>(OriginalType);
+  if (!VTy)
+    return ConvertToScalar;
+
+  auto OriginalSize =
+      VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue();
+  auto ConvertScalarSize = ConvertToScalar->getScalarSizeInBits();
+  auto ConvertEltCount =
+      (OriginalSize + ConvertScalarSize - 1) / ConvertScalarSize; // round up
+
+  return VectorType::get(Type::getIntNTy(Mod->getContext(), ConvertScalarSize),
+                         llvm::ElementCount::getFixed(ConvertEltCount));
+}
+
+void LiveRegOptimizer::convertToOptType(LiveRegConversion &LR) {
+  if (LR.hasConverted()) {
+    LLVM_DEBUG(dbgs() << "\tAlready has converted def\n");
+    return;
+  }
+
+  auto VTy = dyn_cast<VectorType>(LR.getOriginalType());
+  assert(VTy);
+  auto NewVTy = dyn_cast<VectorType>(LR.getNewType());
+  assert(NewVTy);
+
+  auto V = static_cast<Value *>(LR.getLiveRegDef());
+  auto OriginalSize =
+      VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue();
+  auto NewSize =
+      NewVTy->getScalarSizeInBits() * NewVTy->getElementCount().getFixedValue();
+
+  auto &Builder = LR.getConverBuilder();
+
+  // If there is a bitsize match, we can fit the old vector into a new vector of
+  // desired type
+  if (OriginalSize == NewSize) {
+    LR.setConverted(dyn_cast<Instruction>(Builder.CreateBitCast(V, NewVTy)));
+    LLVM_DEBUG(dbgs() << "\tConverted def to "
+                      << *(*LR.getConverted())->getType() << "\n");
+    return;
+  }
+
+  // If there is a bitsize mismatch, we must widen the vector before the cast
+  assert(NewSize > OriginalSize);
+  auto ExpandedVecElementCount =
+      llvm::ElementCount::getFixed(NewSize / VTy->getScalarSizeInBits());
+
+  SmallVector<int, 8> ShuffleMask;
+  for (unsigned I = 0; I < VTy->getElementCount().getFixedValue(); I++)
+    ShuffleMask.push_back(I);
+
+  for (uint64_t I = VTy->getElementCount().getFixedValue();
+       I < ExpandedVecElementCount.getFixedValue(); I++)
+    ShuffleMask.push_back(VTy->getElementCount().getFixedValue());
+
+  auto ExpandedVec =
+      dyn_cast<Instruction>(Builder.CreateShuffleVector(V, ShuffleMask));
+  LR.setConverted(
+      dyn_cast<Instruction>(Builder.CreateBitCast(ExpandedVec, NewVTy)));
+  LLVM_DEBUG(dbgs() << "\tConverted def to " << *(*LR.getConverted())->getType()
+                    << "\n");
+  return;
+}
+
+void LiveRegOptimizer::convertFromOptType(LiveRegConversion &LRC) {
+  auto VTy = dyn_cast<VectorType>(LRC.getOriginalType());
+  assert(VTy);
+  auto NewVTy = dyn_cast<VectorType>(LRC.getNewType());
+  assert(NewVTy);
+
+  auto V = static_cast<Value *>(LRC.getLiveRegDef());
+  auto OriginalSize =
+      VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue();
+  auto NewSize =
+      NewVTy->getScalarSizeInBits() * NewVTy->getElementCount().getFixedValue();
+
+  auto &Builder = LRC.getConverBuilder();
+
+  // If there is a bitsize match, we simply bitcast back to the original type
+  if (OriginalSize == NewSize) {
+    LRC.setConverted(dyn_cast<Instruction>(Builder.CreateBitCast(V, NewVTy)));
+    LLVM_DEBUG(dbgs() << "\tProduced for user: " << **LRC.getConverted()
+                      << "\n");
+    return;
+  }
+
+  // If there is a bitsize mismatch, we have used a wider vector and must strip
+  // the MSBs (the extra high lanes) to convert back to the original type
+  assert(OriginalSize > NewSize);
+  auto ExpandedVecElementCount = llvm::ElementCount::getFixed(
+      OriginalSize / NewVTy->getScalarSizeInBits());
+  auto ExpandedVT = VectorType::get(
+      Type::getIntNTy(Mod->getContext(), NewVTy->getScalarSizeInBits()),
+      ExpandedVecElementCount);
+  auto Converted = dyn_cast<Instruction>(
+      Builder.CreateBitCast(LRC.getLiveRegDef(), ExpandedVT));
+
+  auto NarrowElementCount = NewVTy->getElementCount().getFixedValue();
+  SmallVector<int, 8> ShuffleMask;
+  for (uint64_t I = 0; I < NarrowElementCount; I++)
+    ShuffleMask.push_back(I);
+
+  auto NarrowVec = dyn_cast<Instruction>(
+      Builder.CreateShuffleVector(Converted, ShuffleMask));
+  LRC.setConverted(dyn_cast<Instruction>(NarrowVec));
+  LLVM_DEBUG(dbgs() << "\tProduced for user: " << **LRC.getConverted() << "\n");
+  return;
+}
+
+bool LiveRegOptimizer::shouldReplaceUses(const Instruction &I) {
+  // Vectors of illegal types are copied across blocks in an inefficient manner.
+  // They are scalarized and widened to legal scalars. In such cases, we can do
+  // better by using legal vector types
+  auto IType = I.getType();
+  return IType->isVectorTy() && IType->getScalarSizeInBits() < 16 &&
+         !I.getType()->getScalarType()->isPointerTy();
+}
+
 unsigned AMDGPUCodeGenPrepareImpl::getBaseElementBitWidth(const Type *T) const {
   assert(needsPromotionToI32(T) && "T does not need promotion to i32");
 
@@ -2230,6 +2571,7 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
   Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
   Impl.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
   Impl.UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
+  Impl.UsesGlobalISel = TM.Options.EnableGlobalISel;
   auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
   Impl.DT = DTWP ? &DTWP->getDomTree() : nullptr;
   Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
@@ -2250,6 +2592,7 @@ PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
   Impl.UA = &FAM.getResult<UniformityInfoAnalysis>(F);
   Impl.DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
   Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
+  Impl.UsesGlobalISel = TM.Options.EnableGlobalISel;
   SIModeRegisterDefaults Mode(F);
   Impl.HasFP32DenormalFlush =
       Mode.FP32Denormals == DenormalMode::getPreserveSign();
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll
index 192bf7c249817be..1326d988cf31643 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll
@@ -495,10 +495,15 @@ define amdgpu_kernel void @phi_v23i8(<23 x i8> %in, ptr %out, i1 %cond) {
 ; OPT:       then:
 ; OPT-NEXT:    [[X:%.*]] = insertelement <23 x i8> [[IN:%.*]], i8 42, i32 3
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE0:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; OPT-NEXT:    [[TMP0:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE0]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE2:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; OPT-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE2]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE4:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; OPT-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE4]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE6:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; OPT-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE6]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE8:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; OPT-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE8]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE10:%.*]] = extractelement <23 x i8> [[X]], i64 20
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE12:%.*]] = extractelement <23 x i8> [[X]], i64 21
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE14:%.*]] = extractelement <23 x i8> [[X]], i64 22
@@ -506,31 +511,41 @@ define amdgpu_kernel void @phi_v23i8(<23 x i8> %in, ptr %out, i1 %cond) {
 ; OPT:       else:
 ; OPT-NEXT:    [[Y:%.*]] = insertelement <23 x i8> [[IN]], i8 64, i32 6
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE1:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; OPT-NEXT:    [[TMP5:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE1]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE3:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; OPT-NEXT:    [[TMP6:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE3]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE5:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; OPT-NEXT:    [[TMP7:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE5]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE7:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; OPT-NEXT:    [[TMP8:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE7]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE9:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; OPT-NEXT:    [[TMP9:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE9]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE11:%.*]] = extractelement <23 x i8> [[Y]], i64 20
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE13:%.*]] = extractelement <23 x i8> [[Y]], i64 21
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE15:%.*]] = extractelement <23 x i8> [[Y]], i64 22
 ; OPT-NEXT:    br label [[FINALLY]]
 ; OPT:       finally:
-; OPT-NEXT:    [[TMP0:%.*]] = phi <4 x i8> [ [[LARGEPHI_EXTRACTSLICE0]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP1:%.*]] = phi <4 x i8> [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP2:%.*]] = phi <4 x i8> [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP3:%.*]] = phi <4 x i8> [ [[LARGEPHI_EXTRACTSLICE6]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP4:%.*]] = phi <4 x i8> [ [[LARGEPHI_EXTRACTSLICE8]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP5:%.*]] = phi i8 [ [[LARGEPHI_EXTRACTSLICE10]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP6:%.*]] = phi i8 [ [[LARGEPHI_EXTRACTSLICE12]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE13]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP7:%.*]] = phi i8 [ [[LARGEPHI_EXTRACTSLICE14]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE15]], [[ELSE]] ]
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE0:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> poison, <4 x i8> [[TMP0]], i64 0)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE1:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE0]], <4 x i8> [[TMP1]], i64 4)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE2:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE1]], <4 x i8> [[TMP2]], i64 8)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE3:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE2]], <4 x i8> [[TMP3]], i64 12)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE4:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE3]], <4 x i8> [[TMP4]], i64 16)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE5:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE4]], i8 [[TMP5]], i64 20
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE6:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE5]], i8 [[TMP6]], i64 21
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE7:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE6]], i8 [[TMP7]], i64 22
+; OPT-NEXT:    [[TMP10:%.*]] = phi <1 x i32> [ [[TMP0]], [[THEN]] ], [ [[TMP5]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP11:%.*]] = phi <1 x i32> [ [[TMP1]], [[THEN]] ], [ [[TMP6]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP12:%.*]] = phi <1 x i32> [ [[TMP2]], [[THEN]] ], [ [[TMP7]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP13:%.*]] = phi <1 x i32> [ [[TMP3]], [[THEN]] ], [ [[TMP8]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP14:%.*]] = phi <1 x i32> [ [[TMP4]], [[THEN]] ], [ [[TMP9]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP15:%.*]] = phi i8 [ [[LARGEPHI_EXTRACTSLICE10]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP16:%.*]] = phi i8 [ [[LARGEPHI_EXTRACTSLICE12]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE13]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP17:%.*]] = phi i8 [ [[LARGEPHI_EXTRACTSLICE14]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE15]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP18:%.*]] = bitcast <1 x i32> [[TMP14]] to <4 x i8>
+; OPT-NEXT:    [[TMP19:%.*]] = bitcast <1 x i32> [[TMP13]] to <4 x i8>
+; OPT-NEXT:    [[TMP20:%.*]] = bitcast <1 x i32> [[TMP12]] to <4 x i8>
+; OPT-NEXT:    [[TMP21:%.*]] = bitcast <1 x i32> [[TMP11]] to <4 x i8>
+; OPT-NEXT:    [[TMP22:%.*]] = bitcast <1 x i32> [[TMP10]] to <4 x i8>
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE0:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> poison, <4 x i8> [[TMP22]], i64 0)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE1:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE0]], <4 x i8> [[TMP21]], i64 4)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE2:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE1]], <4 x i8> [[TMP20]], i64 8)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE3:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE2]], <4 x i8> [[TMP19]], i64 12)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE4:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE3]], <4 x i8> [[TMP18]], i64 16)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE5:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE4]], i8 [[TMP15]], i64 20
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE6:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE5]], i8 [[TMP16]], i64 21
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE7:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE6]], i8 [[TMP17]], i64 22
 ; OPT-NEXT:    store <23 x i8> [[LARGEPHI_INSERTSLICE7]], ptr [[OUT:%.*]], align 1
 ; OPT-NEXT:    ret void
 ;
@@ -539,13 +554,19 @@ define amdgpu_kernel void @phi_v23i8(<23 x i8> %in, ptr %out, i1 %cond) {
 ; NOOPT-NEXT:    br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]]
 ; NOOPT:       then:
 ; NOOPT-NEXT:    [[X:%.*]] = insertelement <23 x i8> [[IN:%.*]], i8 42, i32 3
+; NOOPT-NEXT:    [[TMP0:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; NOOPT-NEXT:    [[TMP1:%.*]] = bitcast <24 x i8> [[TMP0]] to <6 x i32>
 ; NOOPT-NEXT:    br label [[FINALLY:%.*]]
 ; NOOPT:       else:
 ; NOOPT-NEXT:    [[Y:%.*]] = insertelement <23 x i8> [[IN]], i8 64, i32 6
+; NOOPT-NEXT:    [[TMP2:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; NOOPT-NEXT:    [[TMP3:%.*]] = bitcast <24 x i8> [[TMP2]] to <6 x i32>
 ; NOOPT-NEXT:    br label [[FINALLY]]
 ; NOOPT:       finally:
-; NOOPT-NEXT:    [[VAL:%.*]] = phi <23 x i8> [ [[X]], [[THEN]] ], [ [[Y]], [[ELSE]] ]
-; NOOPT-NEXT:    store <23 x i8> [[VAL]], ptr [[OUT:%.*]], align 1
+; NOOPT-NEXT:    [[TMP4:%.*]] = phi <6 x i32> [ [[TMP1]], [[THEN]] ], [ [[TMP3]], [[ELSE]] ]
+; NOOPT-NEXT:    [[TMP5:%.*]] = bitcast <6 x i32> [[TMP4]] to <24 x i8>
+; NOOPT-NEXT:    [[TMP6:%.*]] = shufflevector <24 x i8> [[TMP5]], <24 x i8> poison, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22>
+; NOOPT-NEXT:    store <23 x i8> [[TMP6]], ptr [[OUT:%.*]], align 1
 ; NOOPT-NEXT:    ret void
 ;
 entry:
@@ -572,31 +593,36 @@ define amdgpu_kernel void @phi_v23i8_zeroinit(<23 x i8> %in, ptr %out, i1 %cond)
 ; OPT:       else:
 ; OPT-NEXT:    [[Y:%.*]] = insertelement <23 x i8> [[IN:%.*]], i8 64, i32 6
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE1:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; OPT-NEXT:    [[TMP0:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE1]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE3:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; OPT-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE3]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE5:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; OPT-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE5]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE7:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; OPT-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE7]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE9:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; OPT-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE9]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE11:%.*]] = extractelement <23 x i8> [[Y]], i64 20
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE13:%.*]] = extractelement <23 x i8> [[Y]], i64 21
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE15:%.*]] = extractelement <23 x i8> [[Y]], i64 22
 ; OPT-NEXT:    br label [[FINALLY]]
 ; OPT:       finally:
-; OPT-NEXT:    [[TMP0:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP1:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP2:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP3:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP4:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP5:%.*]] = phi i8 [ 0, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP6:%.*]] = phi i8 [ 0, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE13]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP7:%.*]] = phi i8 [ 0, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE15]], [[ELSE]] ]
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE0:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> poison, <4 x i8> [[TMP0]], i64 0)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE1:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE0]], <4 x i8> [[TMP1]], i64 4)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE2:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE1]], <4 x i8> [[TMP2]], i64 8)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE3:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE2]], <4 x i8> [[TMP3]], i64 12)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE4:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE3]], <4 x i8> [[TMP4]], i64 16)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE5:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE4]], i8 [[TMP5]], i64 20
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE6:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE5]], i8 [[TMP6]], i64 21
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE7:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE6]], i8 [[TMP7]], i64 22
+; OPT-NEXT:    [[TMP5:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP6:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP7:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP8:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP9:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP10:%.*]] = phi i8 [ 0, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP11:%.*]] = phi i8 [ 0, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE13]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP12:%.*]] = phi i8 [ 0, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE15]], [[ELSE]] ]
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE0:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> poison, <4 x i8> [[TMP5]], i64 0)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE1:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE0]], <4 x i8> [[TMP6]], i64 4)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE2:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE1]], <4 x i8> [[TMP7]], i64 8)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE3:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE2]], <4 x i8> [[TMP8]], i64 12)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE4:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE3]], <4 x i8> [[TMP9]], i64 16)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE5:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE4]], i8 [[TMP10]], i64 20
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE6:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE5]], i8 [[TMP11]], i64 21
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE7:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE6]], i8 [[TMP12]], i64 22
 ; OPT-NEXT:    store <23 x i8> [[LARGEPHI_INSERTSLICE7]], ptr [[OUT:%.*]], align 1
 ; OPT-NEXT:    ret void
 ;
@@ -607,6 +633,8 @@ define amdgpu_kernel void @phi_v23i8_zeroinit(<23 x i8> %in, ptr %out, i1 %cond)
 ; NOOPT-NEXT:    br label [[FINALLY:%.*]]
 ; NOOPT:       else:
 ; NOOPT-NEXT:    [[Y:%.*]] = insertelement <23 x i8> [[IN:%.*]], i8 64, i32 6
+; NOOPT-NEXT:    [[TMP0:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; NOOPT-NEXT:    [[TMP1:%.*]] = bitcast <24 x i8> [[TMP0]] to <6 x i32>
 ; NOOPT-NEXT:    br label [[FINALLY]]
 ; NOOPT:       finally:
 ; NOOPT-NEXT:    [[VAL:%.*]] = phi <23 x i8> [ zeroinitializer, [[THEN]] ], [ [[Y]], [[ELSE]] ]
@@ -635,25 +663,28 @@ define amdgpu_kernel void @phi_v15i8_random_constant_init(<15 x i8> %in, ptr %ou
 ; OPT:       else:
 ; OPT-NEXT:    [[Y:%.*]] = insertelement <15 x i8> [[IN:%.*]], i8 64, i32 6
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE1:%.*]] = shufflevector <15 x i8> [[Y]], <15 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; OPT-NEXT:    [[TMP0:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE1]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE3:%.*]] = shufflevector <15 x i8> [[Y]], <15 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; OPT-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE3]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE5:%.*]] = shufflevector <15 x i8> [[Y]], <15 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; OPT-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE5]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE7:%.*]] = extractelement <15 x i8> [[Y]], i64 12
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE9:%.*]] = extractelement <15 x i8> [[Y]], i64 13
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE11:%.*]] = extractelement <15 x i8> [[Y]], i64 14
 ; OPT-NEXT:    br label [[FINALLY]]
 ; OPT:       finally:
-; OPT-NEXT:    [[TMP0:%.*]] = phi <4 x i8> [ <i8 poison, i8 1, i8 2, i8 3>, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP1:%.*]] = phi <4 x i8> [ <i8 4, i8 undef, i8 6, i8 7>, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP2:%.*]] = phi <4 x i8> [ <i8 9, i8 10, i8 11, i8 12>, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP3:%.*]] = phi i8 [ 13, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP4:%.*]] = phi i8 [ 14, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP5:%.*]] = phi i8 [ undef, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ]
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE0:%.*]] = call <15 x i8> @llvm.vector.insert.v15i8.v4i8(<15 x i8> poison, <4 x i8> [[TMP0]], i64 0)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE1:%.*]] = call <15 x i8> @llvm.vector.insert.v15i8.v4i8(<15 x i8> [[LARGEPHI_INSERTSLICE0]], <4 x i8> [[TMP1]], i64 4)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE2:%.*]] = call <15 x i8> @llvm.vector.insert.v15i8.v4i8(<15 x i8> [[LARGEPHI_INSERTSLICE1]], <4 x i8> [[TMP2]], i64 8)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE3:%.*]] = insertelement <15 x i8> [[LARGEPHI_INSERTSLICE2]], i8 [[TMP3]], i64 12
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE4:%.*]] = insertelement <15 x i8> [[LARGEPHI_INSERTSLICE3]], i8 [[TMP4]], i64 13
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE5:%.*]] = insertelement <15 x i8> [[LARGEPHI_INSERTSLICE4]], i8 [[TMP5]], i64 14
+; OPT-NEXT:    [[TMP3:%.*]] = phi <4 x i8> [ <i8 poison, i8 1, i8 2, i8 3>, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP4:%.*]] = phi <4 x i8> [ <i8 4, i8 undef, i8 6, i8 7>, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP5:%.*]] = phi <4 x i8> [ <i8 9, i8 10, i8 11, i8 12>, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP6:%.*]] = phi i8 [ 13, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP7:%.*]] = phi i8 [ 14, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP8:%.*]] = phi i8 [ undef, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ]
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE0:%.*]] = call <15 x i8> @llvm.vector.insert.v15i8.v4i8(<15 x i8> poison, <4 x i8> [[TMP3]], i64 0)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE1:%.*]] = call <15 x i8> @llvm.vector.insert.v15i8.v4i8(<15 x i8> [[LARGEPHI_INSERTSLICE0]], <4 x i8> [[TMP4]], i64 4)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE2:%.*]] = call <15 x i8> @llvm.vector.insert.v15i8.v4i8(<15 x i8> [[LARGEPHI_INSERTSLICE1]], <4 x i8> [[TMP5]], i64 8)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE3:%.*]] = insertelement <15 x i8> [[LARGEPHI_INSERTSLICE2]], i8 [[TMP6]], i64 12
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE4:%.*]] = insertelement <15 x i8> [[LARGEPHI_INSERTSLICE3]], i8 [[TMP7]], i64 13
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE5:%.*]] = insertelement <15 x i8> [[LARGEPHI_INSERTSLICE4]], i8 [[TMP8]], i64 14
 ; OPT-NEXT:    store <15 x i8> [[LARGEPHI_INSERTSLICE5]], ptr [[OUT:%.*]], align 1
 ; OPT-NEXT:    ret void
 ;
@@ -664,6 +695,8 @@ define amdgpu_kernel void @phi_v15i8_random_constant_init(<15 x i8> %in, ptr %ou
 ; NOOPT-NEXT:    br label [[FINALLY:%.*]]
 ; NOOPT:       else:
 ; NOOPT-NEXT:    [[Y:%.*]] = insertelement <15 x i8> [[IN:%.*]], i8 64, i32 6
+; NOOPT-NEXT:    [[TMP0:%.*]] = shufflevector <15 x i8> [[Y]], <15 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; NOOPT-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
 ; NOOPT-NEXT:    br label [[FINALLY]]
 ; NOOPT:       finally:
 ; NOOPT-NEXT:    [[VAL:%.*]] = phi <15 x i8> [ <i8 poison, i8 1, i8 2, i8 3, i8 4, i8 undef, i8 6, i8 7, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 undef>, [[THEN]] ], [ [[Y]], [[ELSE]] ]
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index b8d18f56b760239..1f4199708dc648d 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -10,10 +10,10 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    global_load_dword v2, v1, s[4:5]
+; GFX906-NEXT:    s_mov_b32 s4, 0xc060504
 ; GFX906-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 8, v2
+; GFX906-NEXT:    v_perm_b32 v2, v2, v2, s4
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB0_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
@@ -21,16 +21,13 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
 ; GFX906-NEXT:    v_mov_b32_e32 v0, s7
 ; GFX906-NEXT:    v_add_co_u32_e32 v2, vcc, s6, v2
 ; GFX906-NEXT:    v_addc_co_u32_e32 v3, vcc, v0, v3, vcc
-; GFX906-NEXT:    global_load_dword v2, v[2:3], off
+; GFX906-NEXT:    global_load_dword v0, v[2:3], off
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 8, v2
+; GFX906-NEXT:    v_perm_b32 v2, v0, v0, s4
 ; GFX906-NEXT:  .LBB0_2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v4
-; GFX906-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_byte v1, v3, s[2:3] offset:2
-; GFX906-NEXT:    global_store_short v1, v0, s[2:3]
+; GFX906-NEXT:    global_store_byte_d16_hi v1, v2, s[2:3] offset:2
+; GFX906-NEXT:    global_store_short v1, v2, s[2:3]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -59,30 +56,19 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    global_load_dword v2, v1, s[4:5]
 ; GFX906-NEXT:    v_mov_b32_e32 v1, 0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_lshlrev_b64 v[2:3], 2, v[0:1]
 ; GFX906-NEXT:    v_mov_b32_e32 v0, s7
 ; GFX906-NEXT:    v_add_co_u32_e32 v2, vcc, s6, v2
 ; GFX906-NEXT:    v_addc_co_u32_e32 v3, vcc, v0, v3, vcc
 ; GFX906-NEXT:    global_load_dword v2, v[2:3], off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
 ; GFX906-NEXT:  .LBB1_2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v5
-; GFX906-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    global_store_dword v1, v2, s[2:3]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -110,11 +96,10 @@ define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    global_load_dwordx2 v[2:3], v1, s[4:5]
+; GFX906-NEXT:    s_mov_b32 s4, 0
 ; GFX906-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b64 v[4:5], 24, v[2:3]
-; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v3, s4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
@@ -124,18 +109,11 @@ define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
 ; GFX906-NEXT:    v_addc_co_u32_e32 v3, vcc, v0, v3, vcc
 ; GFX906-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b64 v[4:5], 24, v[2:3]
-; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v3, s4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX906-NEXT:  .LBB2_2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v6
-; GFX906-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v4
-; GFX906-NEXT:    v_or_b32_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX906-NEXT:    global_store_byte v1, v3, s[2:3] offset:4
-; GFX906-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX906-NEXT:    global_store_dword v1, v2, s[2:3]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -164,40 +142,18 @@ define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    global_load_dwordx2 v[2:3], v1, s[4:5]
 ; GFX906-NEXT:    v_mov_b32_e32 v1, 0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 8, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_lshlrev_b64 v[2:3], 3, v[0:1]
 ; GFX906-NEXT:    v_mov_b32_e32 v0, s7
 ; GFX906-NEXT:    v_add_co_u32_e32 v2, vcc, s6, v2
 ; GFX906-NEXT:    v_addc_co_u32_e32 v3, vcc, v0, v3, vcc
 ; GFX906-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 8, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
 ; GFX906-NEXT:  .LBB3_2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v9
-; GFX906-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v7
-; GFX906-NEXT:    v_or_b32_sdwa v2, v8, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v6
-; GFX906-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v4
-; GFX906-NEXT:    v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    global_store_dwordx2 v1, v[2:3], s[2:3]
 ; GFX906-NEXT:    s_endpgm
 entry:
@@ -227,62 +183,18 @@ define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    global_load_dwordx4 v[2:5], v1, s[4:5]
 ; GFX906-NEXT:    v_mov_b32_e32 v1, 0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 24, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 8, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 16, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 8, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 24, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 8, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v2
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_lshlrev_b64 v[2:3], 4, v[0:1]
 ; GFX906-NEXT:    v_mov_b32_e32 v0, s7
 ; GFX906-NEXT:    v_add_co_u32_e32 v2, vcc, s6, v2
 ; GFX906-NEXT:    v_addc_co_u32_e32 v3, vcc, v0, v3, vcc
 ; GFX906-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 24, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 8, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 16, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 8, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 24, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 8, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v2
 ; GFX906-NEXT:  .LBB4_2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v17
-; GFX906-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v15
-; GFX906-NEXT:    v_or_b32_sdwa v2, v16, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v14
-; GFX906-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v12
-; GFX906-NEXT:    v_or_b32_sdwa v3, v13, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v11
-; GFX906-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v4, 8, v9
-; GFX906-NEXT:    v_or_b32_sdwa v4, v10, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v8
-; GFX906-NEXT:    v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v6
-; GFX906-NEXT:    v_or_b32_sdwa v5, v7, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    global_store_dwordx4 v1, v[2:5], s[2:3]
 ; GFX906-NEXT:    s_endpgm
 entry:
@@ -306,118 +218,28 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
 ; GFX906-LABEL: v32i8_liveout:
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 5, v0
-; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx4 v[2:5], v1, s[4:5] offset:16
-; GFX906-NEXT:    global_load_dwordx4 v[6:9], v1, s[4:5]
+; GFX906-NEXT:    global_load_dwordx4 v[4:7], v1, s[4:5] offset:16
+; GFX906-NEXT:    global_load_dwordx4 v[8:11], v1, s[4:5]
 ; GFX906-NEXT:    v_mov_b32_e32 v1, 0
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 24, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 8, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 24, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 16, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 8, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 24, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 16, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v18, 8, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v19, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v20, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v21, 8, v2
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v22, 24, v9
-; GFX906-NEXT:    v_lshrrev_b32_e32 v23, 16, v9
-; GFX906-NEXT:    v_lshrrev_b32_e32 v24, 8, v9
-; GFX906-NEXT:    v_lshrrev_b32_e32 v25, 24, v8
-; GFX906-NEXT:    v_lshrrev_b32_e32 v26, 16, v8
-; GFX906-NEXT:    v_lshrrev_b32_e32 v27, 8, v8
-; GFX906-NEXT:    v_lshrrev_b32_e32 v28, 24, v7
-; GFX906-NEXT:    v_lshrrev_b32_e32 v30, 16, v7
-; GFX906-NEXT:    v_lshrrev_b32_e32 v29, 8, v7
-; GFX906-NEXT:    v_lshrrev_b32_e32 v32, 24, v6
-; GFX906-NEXT:    v_lshrrev_b32_e32 v33, 16, v6
-; GFX906-NEXT:    v_lshrrev_b32_e32 v31, 8, v6
-; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
 ; GFX906-NEXT:    v_lshlrev_b64 v[2:3], 5, v[0:1]
 ; GFX906-NEXT:    v_mov_b32_e32 v0, s7
-; GFX906-NEXT:    v_add_co_u32_e32 v10, vcc, s6, v2
-; GFX906-NEXT:    v_addc_co_u32_e32 v11, vcc, v0, v3, vcc
-; GFX906-NEXT:    global_load_dwordx4 v[2:5], v[10:11], off offset:16
-; GFX906-NEXT:    global_load_dwordx4 v[6:9], v[10:11], off
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 24, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 8, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 24, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 16, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 8, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 24, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 16, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v18, 8, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v19, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v20, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v21, 8, v2
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v22, 24, v9
-; GFX906-NEXT:    v_lshrrev_b32_e32 v23, 16, v9
-; GFX906-NEXT:    v_lshrrev_b32_e32 v24, 8, v9
-; GFX906-NEXT:    v_lshrrev_b32_e32 v25, 24, v8
-; GFX906-NEXT:    v_lshrrev_b32_e32 v26, 16, v8
-; GFX906-NEXT:    v_lshrrev_b32_e32 v27, 8, v8
-; GFX906-NEXT:    v_lshrrev_b32_e32 v28, 24, v7
-; GFX906-NEXT:    v_lshrrev_b32_e32 v30, 16, v7
-; GFX906-NEXT:    v_lshrrev_b32_e32 v29, 8, v7
-; GFX906-NEXT:    v_lshrrev_b32_e32 v32, 24, v6
-; GFX906-NEXT:    v_lshrrev_b32_e32 v33, 16, v6
-; GFX906-NEXT:    v_lshrrev_b32_e32 v31, 8, v6
+; GFX906-NEXT:    v_add_co_u32_e32 v2, vcc, s6, v2
+; GFX906-NEXT:    v_addc_co_u32_e32 v3, vcc, v0, v3, vcc
+; GFX906-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:16
+; GFX906-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off
 ; GFX906-NEXT:  .LBB5_2: ; %bb.2
-; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX906-NEXT:    v_lshlrev_b16_e32 v28, 8, v28
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v32
-; GFX906-NEXT:    v_or_b32_sdwa v28, v30, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v30, 8, v31
-; GFX906-NEXT:    v_or_b32_sdwa v0, v33, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v6, v6, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v6, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v27
-; GFX906-NEXT:    v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v8, 8, v25
-; GFX906-NEXT:    v_or_b32_sdwa v8, v26, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v8, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v24
-; GFX906-NEXT:    v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v9, 8, v22
-; GFX906-NEXT:    v_or_b32_sdwa v9, v23, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v9, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v21
-; GFX906-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v19
-; GFX906-NEXT:    v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v18
-; GFX906-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v16
-; GFX906-NEXT:    v_or_b32_sdwa v3, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v15
-; GFX906-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v4, 8, v13
-; GFX906-NEXT:    v_or_b32_sdwa v4, v14, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v12
-; GFX906-NEXT:    v_lshlrev_b16_e32 v29, 8, v29
-; GFX906-NEXT:    v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v10
-; GFX906-NEXT:    v_or_b32_sdwa v7, v7, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v5, v11, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v7, v7, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v1, v[6:9], s[0:1]
-; GFX906-NEXT:    global_store_dwordx4 v1, v[2:5], s[0:1] offset:16
+; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    global_store_dwordx4 v1, v[4:7], s[2:3] offset:16
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    global_store_dwordx4 v1, v[8:11], s[2:3]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -449,1537 +271,993 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX906-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx4 v[18:21], v2, s[4:5] offset:240
-; GFX906-NEXT:    global_load_dwordx4 v[6:9], v2, s[4:5] offset:224
-; GFX906-NEXT:    global_load_dwordx4 v[10:13], v2, s[4:5] offset:208
-; GFX906-NEXT:    global_load_dwordx4 v[14:17], v2, s[4:5] offset:192
+; GFX906-NEXT:    global_load_dwordx4 v[54:57], v2, s[4:5] offset:240
+; GFX906-NEXT:    global_load_dwordx4 v[38:41], v2, s[4:5] offset:224
+; GFX906-NEXT:    global_load_dwordx4 v[20:23], v2, s[4:5] offset:208
 ; GFX906-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v21
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v21
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v21
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v20
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v20
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v20
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v19
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v19
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v19
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v18
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v18
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v18, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    buffer_store_dword v19, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v20, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v21, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v18
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v9
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v9
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v9
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v8
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v8
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v8
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v7
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v7
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v7
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v6
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v6
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v6
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v13
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v13
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v13
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v12
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v12
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v12
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v11
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v11
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v11
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v10
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v10
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v10
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v17
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v17
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v17
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v16
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v16
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v16
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v15
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v15
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v15
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v14
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v14
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v14
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[18:21], v2, s[4:5] offset:176
-; GFX906-NEXT:    global_load_dwordx4 v[22:25], v2, s[4:5] offset:160
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v21
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v21
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v21
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v20
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v20
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v20
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v19
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v19
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v19
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v18
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v18
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v18
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v25
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v25
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v25
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v24
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v24
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v24
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v23
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v23
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v23
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v22
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v22
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v22
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[26:29], v2, s[4:5] offset:144
-; GFX906-NEXT:    global_load_dwordx4 v[30:33], v2, s[4:5] offset:128
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v29
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v29
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v29
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v28
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v28
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v28
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v27
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v27
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v27
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v26
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v26
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v26
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v33
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v33
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v33
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v32
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v32
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v32
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v31
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v31
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v31
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v30
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v30
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v30
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    buffer_store_dword v1, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v2, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill
 ; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[34:37], v2, s[4:5] offset:112
-; GFX906-NEXT:    global_load_dwordx4 v[38:41], v2, s[4:5] offset:96
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v37
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v37
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v37
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v36
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v36
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v36
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v35
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v35
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v35
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v34
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v34
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v34
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v41
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v41
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v41
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v40
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v40
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v40
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v39
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v39
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v39
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v38
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v38
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v38
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[42:45], v2, s[4:5] offset:80
-; GFX906-NEXT:    global_load_dwordx4 v[46:49], v2, s[4:5] offset:64
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v45
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v45
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v45
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v44
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v44
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v44
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v43
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v43
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v43
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v42
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v42
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v42
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v49
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v49
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v49
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v48
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v48
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v48
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v47
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v47
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v47
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v46
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v46
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v46
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[50:53], v2, s[4:5] offset:48
-; GFX906-NEXT:    global_load_dwordx4 v[54:57], v2, s[4:5] offset:32
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v53
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v53
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v53
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v52
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v52
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v52
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v51
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v51
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v51
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v50
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v50
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v50
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v57
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v57
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v57
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v56
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v56
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v56
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v55
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v55
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v55
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v54
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v54
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v54
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[58:61], v2, s[4:5] offset:16
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v5, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v6, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v7, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v8, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v10, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v11, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v12, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v13, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v14, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v15, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v16, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v18, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v19, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v20, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v21, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v22, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v23, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v24, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v25, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v26, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v27, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v28, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v29, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v30, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v31, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[42:45], v2, s[4:5] offset:192
 ; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    global_load_dwordx4 v[2:5], v2, s[4:5]
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 24, v61
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 16, v61
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 8, v61
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 24, v60
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 16, v60
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 8, v60
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 24, v59
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 16, v59
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 8, v59
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 24, v58
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 16, v58
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 8, v58
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 24, v5
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 16, v5
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 8, v5
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 24, v4
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 16, v4
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 8, v4
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 24, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v63, 24, v2
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 16, v3
-; GFX906-NEXT:    buffer_store_dword v63, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v63, 16, v2
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 8, v3
-; GFX906-NEXT:    buffer_store_dword v63, off, s[8:11], 0 offset:776 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v63, 8, v2
-; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB6_2
-; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    v_lshlrev_b64 v[2:3], 3, v[0:1]
-; GFX906-NEXT:    v_mov_b32_e32 v0, s7
-; GFX906-NEXT:    v_add_co_u32_e32 v2, vcc, s6, v2
-; GFX906-NEXT:    v_addc_co_u32_e32 v3, vcc, v0, v3, vcc
-; GFX906-NEXT:    global_load_dwordx4 v[18:21], v[2:3], off offset:240
-; GFX906-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off offset:224
-; GFX906-NEXT:    global_load_dwordx4 v[10:13], v[2:3], off offset:208
-; GFX906-NEXT:    global_load_dwordx4 v[14:17], v[2:3], off offset:192
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v21
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v21
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v21
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v20
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v20
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v20
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v19
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v19
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v19
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v18
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v18
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v18, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[12:15], v2, s[4:5] offset:176
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    buffer_store_dword v1, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v2, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v5, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v6, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v7, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v8, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v10, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v11, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v12, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v13, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v14, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v15, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v16, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v18, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v19, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v20, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v21, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v22, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v23, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v24, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v25, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v26, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v27, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v28, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v29, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v30, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v31, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[8:11], v2, s[4:5] offset:160
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    buffer_store_dword v19, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v20, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v21, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v18
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v9
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v9
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v9
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v8
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v8
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v8
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v7
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v7
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v7
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v6
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v6
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v13
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v13
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v13
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v12
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v12
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v12
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v11
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v11
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v11
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v10
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v10
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v10
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v17
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v17
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v17
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v16
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v16
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v16
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v15
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v15
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v15
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v14
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v14
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v14
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[18:21], v[2:3], off offset:176
-; GFX906-NEXT:    global_load_dwordx4 v[22:25], v[2:3], off offset:160
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v21
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v21
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v21
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v20
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v20
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v20
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v19
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v19
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v19
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v18
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v18
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v18
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v25
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v25
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v25
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v24
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v24
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v24
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v23
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v23
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v23
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v22
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v22
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v22
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[26:29], v[2:3], off offset:144
-; GFX906-NEXT:    global_load_dwordx4 v[30:33], v[2:3], off offset:128
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v29
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v29
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v29
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v28
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v28
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v28
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v27
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v27
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v27
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v26
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v26
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v26
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v33
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v33
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v33
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v32
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v32
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v32
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v31
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v31
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v31
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v30
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v30
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v30
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[34:37], v[2:3], off offset:112
-; GFX906-NEXT:    global_load_dwordx4 v[38:41], v[2:3], off offset:96
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v37
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v37
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v37
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v36
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v36
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v36
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v35
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v35
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v35
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v34
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v34
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v34
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v41
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v41
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v41
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v40
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v40
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v40
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v39
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v39
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v39
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v38
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v38
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v38
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[42:45], v[2:3], off offset:80
-; GFX906-NEXT:    global_load_dwordx4 v[46:49], v[2:3], off offset:64
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v45
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v45
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v45
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v44
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v44
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v44
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v43
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v43
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v43
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v42
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v42
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v42
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v49
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v49
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v49
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v48
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v48
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v48
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v47
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v47
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v47
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v46
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v46
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v46
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[50:53], v[2:3], off offset:48
-; GFX906-NEXT:    global_load_dwordx4 v[54:57], v[2:3], off offset:32
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v53
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v53
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v53
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v52
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v52
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v52
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v51
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v51
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v51
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v50
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v50
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v50
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v57
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v57
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v57
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v56
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v56
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v56
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v55
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v55
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v55
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v54
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v54
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v54
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[58:61], v[2:3], off offset:16
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v61
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v61
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v61
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v60
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v60
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v60
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v59
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v59
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v59
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v58
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v58
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v58
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v5
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v5
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v4
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v4
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v4
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v3
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v3
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 8, v3
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v63, 8, v2
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:776 ; 4-byte Folded Spill
-; GFX906-NEXT:  .LBB6_2: ; %bb.2
-; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v63
-; GFX906-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v62
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:768 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v62, off, s[8:11], 0 offset:776 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v63, off, s[8:11], 0 offset:764 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:756 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v5, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:772 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v62, off, s[8:11], 0 offset:760 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:748 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v62, 8, v62
-; GFX906-NEXT:    v_or_b32_sdwa v62, v63, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v62, off, s[8:11], 0 offset:752 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v0, v62, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:740 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v62, off, s[8:11], 0 offset:744 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v0, v62, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v1, v[2:5], s[2:3]
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:732 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:736 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v3, v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:720 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v59, off, s[8:11], 0 offset:716 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v4, v60, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:708 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v58, off, s[8:11], 0 offset:728 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v5, v61, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:724 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v58, off, s[8:11], 0 offset:712 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:700 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v58, 8, v58
-; GFX906-NEXT:    v_or_b32_sdwa v58, v59, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v58, off, s[8:11], 0 offset:704 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v0, v58, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:692 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v58, off, s[8:11], 0 offset:696 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v0, v58, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v1, v[2:5], s[2:3] offset:16
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:684 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:688 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v3, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:672 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v55, off, s[8:11], 0 offset:668 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v4, v56, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:660 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v54, off, s[8:11], 0 offset:680 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v5, v57, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:676 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v54, off, s[8:11], 0 offset:664 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v54, 8, v54
-; GFX906-NEXT:    v_or_b32_sdwa v54, v55, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v54, off, s[8:11], 0 offset:656 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v0, v54, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v54, off, s[8:11], 0 offset:648 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v0, v54, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v1, v[2:5], s[2:3] offset:32
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:636 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v3, v51, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:624 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v51, off, s[8:11], 0 offset:620 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v4, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:612 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v50, off, s[8:11], 0 offset:632 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v5, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:628 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v50, off, s[8:11], 0 offset:616 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:604 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v50, 8, v50
-; GFX906-NEXT:    v_or_b32_sdwa v50, v51, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v50, off, s[8:11], 0 offset:608 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    buffer_store_dword v1, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v2, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v5, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v6, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v7, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v8, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v10, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v11, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v12, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v13, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v14, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v15, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v16, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v18, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v19, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v20, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v21, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v22, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v23, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v24, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v25, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v26, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v27, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v28, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v29, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v30, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v31, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[4:7], v2, s[4:5] offset:144
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v0, v50, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v50, off, s[8:11], 0 offset:600 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v0, v50, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v1, v[2:5], s[2:3] offset:48
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:588 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_store_dword v1, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v2, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v5, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v6, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v7, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v8, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v10, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v11, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v12, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v13, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v14, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v15, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v16, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v18, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v19, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v20, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v21, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v22, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v23, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v24, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v25, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v26, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v27, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v28, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v29, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v30, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v31, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[3:6], v2, s[4:5] offset:128
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v3, v47, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:576 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v47, off, s[8:11], 0 offset:572 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v4, v48, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:564 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v46, off, s[8:11], 0 offset:584 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v5, v49, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:580 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v46, off, s[8:11], 0 offset:568 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v46, 8, v46
-; GFX906-NEXT:    v_or_b32_sdwa v46, v47, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v46, off, s[8:11], 0 offset:560 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v5, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v6, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v7, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v8, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v10, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v11, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v12, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v13, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v14, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v15, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v16, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v18, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v19, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v20, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v21, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v22, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v23, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v24, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v25, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v26, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v27, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v28, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v29, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v30, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v31, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v32, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v33, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v34, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[46:49], v2, s[4:5] offset:112
+; GFX906-NEXT:    global_load_dwordx4 v[50:53], v2, s[4:5] offset:96
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    global_load_dwordx4 v[20:23], v2, s[4:5] offset:80
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v0, v46, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v46, off, s[8:11], 0 offset:552 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v0, v46, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v1, v[2:5], s[2:3] offset:64
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:540 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_store_dword v1, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v2, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v5, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v6, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v7, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v8, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v10, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v11, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v12, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v13, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v14, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v15, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v16, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v18, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v19, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v20, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v21, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v22, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v23, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v24, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v25, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v26, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v27, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v28, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v29, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v30, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v31, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[16:19], v2, s[4:5] offset:64
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    global_load_dwordx4 v[12:15], v2, s[4:5] offset:48
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v3, v43, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:528 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v43, off, s[8:11], 0 offset:524 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v4, v44, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:516 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v42, off, s[8:11], 0 offset:536 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v5, v45, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:532 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v42, off, s[8:11], 0 offset:520 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:508 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v42, 8, v42
-; GFX906-NEXT:    v_or_b32_sdwa v42, v43, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v42, off, s[8:11], 0 offset:512 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    buffer_store_dword v1, off, s[8:11], 0 offset:776 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v2, off, s[8:11], 0 offset:780 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:784 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:788 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v5, off, s[8:11], 0 offset:792 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v6, off, s[8:11], 0 offset:796 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v7, off, s[8:11], 0 offset:800 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v8, off, s[8:11], 0 offset:804 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:808 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v10, off, s[8:11], 0 offset:812 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v11, off, s[8:11], 0 offset:816 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v12, off, s[8:11], 0 offset:820 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v13, off, s[8:11], 0 offset:824 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v14, off, s[8:11], 0 offset:828 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v15, off, s[8:11], 0 offset:832 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v16, off, s[8:11], 0 offset:836 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:840 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v18, off, s[8:11], 0 offset:844 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v19, off, s[8:11], 0 offset:848 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v20, off, s[8:11], 0 offset:852 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v21, off, s[8:11], 0 offset:856 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v22, off, s[8:11], 0 offset:860 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v23, off, s[8:11], 0 offset:864 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v24, off, s[8:11], 0 offset:868 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v25, off, s[8:11], 0 offset:872 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v26, off, s[8:11], 0 offset:876 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v27, off, s[8:11], 0 offset:880 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v28, off, s[8:11], 0 offset:884 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v29, off, s[8:11], 0 offset:888 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v30, off, s[8:11], 0 offset:892 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v31, off, s[8:11], 0 offset:896 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[8:11], v2, s[4:5] offset:32
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:900 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    buffer_store_dword v1, off, s[8:11], 0 offset:904 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v2, off, s[8:11], 0 offset:908 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:912 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:916 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v5, off, s[8:11], 0 offset:920 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v6, off, s[8:11], 0 offset:924 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v7, off, s[8:11], 0 offset:928 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v8, off, s[8:11], 0 offset:932 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:936 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v10, off, s[8:11], 0 offset:940 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v11, off, s[8:11], 0 offset:944 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v12, off, s[8:11], 0 offset:948 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v13, off, s[8:11], 0 offset:952 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v14, off, s[8:11], 0 offset:956 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v15, off, s[8:11], 0 offset:960 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v16, off, s[8:11], 0 offset:964 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:968 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v18, off, s[8:11], 0 offset:972 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v19, off, s[8:11], 0 offset:976 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v20, off, s[8:11], 0 offset:980 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v21, off, s[8:11], 0 offset:984 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v22, off, s[8:11], 0 offset:988 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v23, off, s[8:11], 0 offset:992 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v24, off, s[8:11], 0 offset:996 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v25, off, s[8:11], 0 offset:1000 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v26, off, s[8:11], 0 offset:1004 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v27, off, s[8:11], 0 offset:1008 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v28, off, s[8:11], 0 offset:1012 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v29, off, s[8:11], 0 offset:1016 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v30, off, s[8:11], 0 offset:1020 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v31, off, s[8:11], 0 offset:1024 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[4:7], v2, s[4:5] offset:16
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:1028 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    buffer_store_dword v1, off, s[8:11], 0 offset:1032 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v2, off, s[8:11], 0 offset:1036 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:1040 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:1044 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v5, off, s[8:11], 0 offset:1048 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v6, off, s[8:11], 0 offset:1052 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v7, off, s[8:11], 0 offset:1056 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v8, off, s[8:11], 0 offset:1060 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:1064 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v10, off, s[8:11], 0 offset:1068 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v11, off, s[8:11], 0 offset:1072 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v12, off, s[8:11], 0 offset:1076 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v13, off, s[8:11], 0 offset:1080 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v14, off, s[8:11], 0 offset:1084 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v15, off, s[8:11], 0 offset:1088 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v16, off, s[8:11], 0 offset:1092 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:1096 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v18, off, s[8:11], 0 offset:1100 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v19, off, s[8:11], 0 offset:1104 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v20, off, s[8:11], 0 offset:1108 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v21, off, s[8:11], 0 offset:1112 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v22, off, s[8:11], 0 offset:1116 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v23, off, s[8:11], 0 offset:1120 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v24, off, s[8:11], 0 offset:1124 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v25, off, s[8:11], 0 offset:1128 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v26, off, s[8:11], 0 offset:1132 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v27, off, s[8:11], 0 offset:1136 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v28, off, s[8:11], 0 offset:1140 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v29, off, s[8:11], 0 offset:1144 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v30, off, s[8:11], 0 offset:1148 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v31, off, s[8:11], 0 offset:1152 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[2:5], v2, s[4:5]
+; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX906-NEXT:    s_cbranch_execz .LBB6_2
+; GFX906-NEXT:  ; %bb.1: ; %bb.1
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v0, v42, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v42, off, s[8:11], 0 offset:504 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    v_lshlrev_b64 v[2:3], 3, v[0:1]
+; GFX906-NEXT:    v_mov_b32_e32 v0, s7
+; GFX906-NEXT:    v_add_co_u32_e32 v2, vcc, s6, v2
+; GFX906-NEXT:    v_addc_co_u32_e32 v3, vcc, v0, v3, vcc
+; GFX906-NEXT:    global_load_dwordx4 v[54:57], v[2:3], off offset:240
+; GFX906-NEXT:    global_load_dwordx4 v[38:41], v[2:3], off offset:224
+; GFX906-NEXT:    global_load_dwordx4 v[20:23], v[2:3], off offset:208
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v0, v42, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v1, v[2:5], s[2:3] offset:80
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:492 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v3, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:480 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v39, off, s[8:11], 0 offset:476 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v4, v40, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:468 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v38, off, s[8:11], 0 offset:488 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v5, v41, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:484 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_store_dword v1, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v2, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v5, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v6, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v7, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v8, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v10, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v11, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v12, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v13, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v14, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v15, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v16, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v18, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v19, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v20, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v21, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v22, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v23, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v24, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v25, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v26, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v27, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v28, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v29, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v30, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v31, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[42:45], v[2:3], off offset:192
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    global_load_dwordx4 v[12:15], v[2:3], off offset:176
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    buffer_store_dword v1, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v2, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v5, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v6, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v7, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v8, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v10, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v11, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v12, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v13, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v14, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v15, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v16, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v18, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v19, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v20, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v21, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v22, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v23, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v24, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v25, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v26, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v27, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v28, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v29, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v30, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v31, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off offset:160
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v38, off, s[8:11], 0 offset:472 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:460 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v38, 8, v38
-; GFX906-NEXT:    v_or_b32_sdwa v38, v39, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v38, off, s[8:11], 0 offset:464 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:452 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v38, off, s[8:11], 0 offset:456 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    buffer_store_dword v1, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v2, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v5, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v6, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v7, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v8, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v10, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v11, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v12, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v13, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v14, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v15, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v16, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v18, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v19, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v20, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v21, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v22, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v23, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v24, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v25, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v26, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v27, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v28, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v29, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v30, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v31, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:144
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v1, v[2:5], s[2:3] offset:96
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:444 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v3, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:432 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v35, off, s[8:11], 0 offset:428 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v4, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:420 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v34, off, s[8:11], 0 offset:440 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v5, v37, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:436 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_store_dword v1, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v2, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v5, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v6, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v7, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v8, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v10, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v11, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v12, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v13, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v14, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v15, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v16, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v18, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v19, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v20, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v21, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v22, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v23, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v24, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v25, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v26, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v27, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v28, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v29, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v30, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v31, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:128
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    buffer_store_dword v5, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v6, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v7, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v8, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v10, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v11, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v12, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v13, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v14, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v15, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v16, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v18, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v19, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v20, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v21, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v22, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v23, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v24, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v25, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v26, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v27, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v28, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v29, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v30, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v31, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v32, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v33, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v34, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v35, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[46:49], v[2:3], off offset:112
+; GFX906-NEXT:    global_load_dwordx4 v[50:53], v[2:3], off offset:96
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    global_load_dwordx4 v[20:23], v[2:3], off offset:80
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v34, off, s[8:11], 0 offset:424 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:412 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v34, 8, v34
-; GFX906-NEXT:    v_or_b32_sdwa v34, v35, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v34, off, s[8:11], 0 offset:416 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v0, v34, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:404 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v34, off, s[8:11], 0 offset:408 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    buffer_store_dword v1, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v2, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v5, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v6, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v7, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v8, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v10, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v11, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v12, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v13, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v14, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v15, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v16, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v18, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v19, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v20, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v21, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v22, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v23, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v24, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v25, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v26, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v27, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v28, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v29, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v30, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v31, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[16:19], v[2:3], off offset:64
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    global_load_dwordx4 v[12:15], v[2:3], off offset:48
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v0, v34, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v1, v[2:5], s[2:3] offset:112
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:396 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v3, v31, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:384 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v31, off, s[8:11], 0 offset:380 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v4, v32, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:372 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v30, off, s[8:11], 0 offset:392 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v5, v33, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_store_dword v1, off, s[8:11], 0 offset:776 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v2, off, s[8:11], 0 offset:780 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:784 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:788 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v5, off, s[8:11], 0 offset:792 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v6, off, s[8:11], 0 offset:796 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v7, off, s[8:11], 0 offset:800 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v8, off, s[8:11], 0 offset:804 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:808 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v10, off, s[8:11], 0 offset:812 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v11, off, s[8:11], 0 offset:816 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v12, off, s[8:11], 0 offset:820 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v13, off, s[8:11], 0 offset:824 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v14, off, s[8:11], 0 offset:828 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v15, off, s[8:11], 0 offset:832 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v16, off, s[8:11], 0 offset:836 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:840 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v18, off, s[8:11], 0 offset:844 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v19, off, s[8:11], 0 offset:848 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v20, off, s[8:11], 0 offset:852 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v21, off, s[8:11], 0 offset:856 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v22, off, s[8:11], 0 offset:860 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v23, off, s[8:11], 0 offset:864 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v24, off, s[8:11], 0 offset:868 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v25, off, s[8:11], 0 offset:872 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v26, off, s[8:11], 0 offset:876 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v27, off, s[8:11], 0 offset:880 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v28, off, s[8:11], 0 offset:884 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v29, off, s[8:11], 0 offset:888 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v30, off, s[8:11], 0 offset:892 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v31, off, s[8:11], 0 offset:896 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off offset:32
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:900 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    buffer_store_dword v1, off, s[8:11], 0 offset:904 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v2, off, s[8:11], 0 offset:908 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:912 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:916 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v5, off, s[8:11], 0 offset:920 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v6, off, s[8:11], 0 offset:924 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v7, off, s[8:11], 0 offset:928 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v8, off, s[8:11], 0 offset:932 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:936 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v10, off, s[8:11], 0 offset:940 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v11, off, s[8:11], 0 offset:944 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v12, off, s[8:11], 0 offset:948 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v13, off, s[8:11], 0 offset:952 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v14, off, s[8:11], 0 offset:956 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v15, off, s[8:11], 0 offset:960 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v16, off, s[8:11], 0 offset:964 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:968 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v18, off, s[8:11], 0 offset:972 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v19, off, s[8:11], 0 offset:976 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v20, off, s[8:11], 0 offset:980 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v21, off, s[8:11], 0 offset:984 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v22, off, s[8:11], 0 offset:988 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v23, off, s[8:11], 0 offset:992 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v24, off, s[8:11], 0 offset:996 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v25, off, s[8:11], 0 offset:1000 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v26, off, s[8:11], 0 offset:1004 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v27, off, s[8:11], 0 offset:1008 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v28, off, s[8:11], 0 offset:1012 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v29, off, s[8:11], 0 offset:1016 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v30, off, s[8:11], 0 offset:1020 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v31, off, s[8:11], 0 offset:1024 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:16
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:1028 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    buffer_store_dword v1, off, s[8:11], 0 offset:1032 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v2, off, s[8:11], 0 offset:1036 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:1040 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:1044 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v5, off, s[8:11], 0 offset:1048 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v6, off, s[8:11], 0 offset:1052 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v7, off, s[8:11], 0 offset:1056 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v8, off, s[8:11], 0 offset:1060 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:1064 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v10, off, s[8:11], 0 offset:1068 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v11, off, s[8:11], 0 offset:1072 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v12, off, s[8:11], 0 offset:1076 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v13, off, s[8:11], 0 offset:1080 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v14, off, s[8:11], 0 offset:1084 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v15, off, s[8:11], 0 offset:1088 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v16, off, s[8:11], 0 offset:1092 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:1096 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v18, off, s[8:11], 0 offset:1100 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v19, off, s[8:11], 0 offset:1104 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v20, off, s[8:11], 0 offset:1108 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v21, off, s[8:11], 0 offset:1112 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v22, off, s[8:11], 0 offset:1116 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v23, off, s[8:11], 0 offset:1120 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v24, off, s[8:11], 0 offset:1124 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v25, off, s[8:11], 0 offset:1128 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v26, off, s[8:11], 0 offset:1132 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v27, off, s[8:11], 0 offset:1136 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v28, off, s[8:11], 0 offset:1140 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v29, off, s[8:11], 0 offset:1144 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v30, off, s[8:11], 0 offset:1148 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v31, off, s[8:11], 0 offset:1152 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
+; GFX906-NEXT:  .LBB6_2: ; %bb.2
+; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX906-NEXT:    global_store_dwordx4 v1, v[46:49], s[2:3] offset:112
+; GFX906-NEXT:    global_store_dwordx4 v1, v[50:53], s[2:3] offset:96
+; GFX906-NEXT:    v_mov_b32_e32 v49, v19
+; GFX906-NEXT:    v_mov_b32_e32 v48, v18
+; GFX906-NEXT:    v_mov_b32_e32 v47, v17
+; GFX906-NEXT:    v_mov_b32_e32 v46, v16
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:260 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:264 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:268 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:272 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:276 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v11, off, s[8:11], 0 offset:280 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v12, off, s[8:11], 0 offset:284 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v13, off, s[8:11], 0 offset:288 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v14, off, s[8:11], 0 offset:292 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v15, off, s[8:11], 0 offset:296 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v16, off, s[8:11], 0 offset:300 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v17, off, s[8:11], 0 offset:304 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v18, off, s[8:11], 0 offset:308 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v19, off, s[8:11], 0 offset:312 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v20, off, s[8:11], 0 offset:316 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v21, off, s[8:11], 0 offset:320 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v22, off, s[8:11], 0 offset:324 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v23, off, s[8:11], 0 offset:328 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v24, off, s[8:11], 0 offset:332 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v25, off, s[8:11], 0 offset:336 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v26, off, s[8:11], 0 offset:340 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v27, off, s[8:11], 0 offset:344 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v28, off, s[8:11], 0 offset:348 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v29, off, s[8:11], 0 offset:352 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v30, off, s[8:11], 0 offset:356 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v31, off, s[8:11], 0 offset:360 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v32, off, s[8:11], 0 offset:364 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v33, off, s[8:11], 0 offset:368 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v34, off, s[8:11], 0 offset:372 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v35, off, s[8:11], 0 offset:376 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v36, off, s[8:11], 0 offset:380 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v37, off, s[8:11], 0 offset:384 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    global_store_dwordx4 v1, v[26:29], s[2:3] offset:80
+; GFX906-NEXT:    global_store_dwordx4 v1, v[46:49], s[2:3] offset:64
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:772 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:776 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:780 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:784 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:788 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v11, off, s[8:11], 0 offset:792 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v12, off, s[8:11], 0 offset:796 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v13, off, s[8:11], 0 offset:800 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v14, off, s[8:11], 0 offset:804 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v15, off, s[8:11], 0 offset:808 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v16, off, s[8:11], 0 offset:812 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v17, off, s[8:11], 0 offset:816 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v18, off, s[8:11], 0 offset:820 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v19, off, s[8:11], 0 offset:824 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v20, off, s[8:11], 0 offset:828 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v21, off, s[8:11], 0 offset:832 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v22, off, s[8:11], 0 offset:836 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v23, off, s[8:11], 0 offset:840 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v24, off, s[8:11], 0 offset:844 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v25, off, s[8:11], 0 offset:848 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v26, off, s[8:11], 0 offset:852 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v27, off, s[8:11], 0 offset:856 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v28, off, s[8:11], 0 offset:860 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v29, off, s[8:11], 0 offset:864 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v30, off, s[8:11], 0 offset:868 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v31, off, s[8:11], 0 offset:872 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v32, off, s[8:11], 0 offset:876 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v33, off, s[8:11], 0 offset:880 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v34, off, s[8:11], 0 offset:884 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v35, off, s[8:11], 0 offset:888 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v36, off, s[8:11], 0 offset:892 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v37, off, s[8:11], 0 offset:896 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    global_store_dwordx4 v1, v[18:21], s[2:3] offset:48
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:900 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:904 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:908 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:912 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:916 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v11, off, s[8:11], 0 offset:920 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v12, off, s[8:11], 0 offset:924 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v13, off, s[8:11], 0 offset:928 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v14, off, s[8:11], 0 offset:932 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v15, off, s[8:11], 0 offset:936 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v16, off, s[8:11], 0 offset:940 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v17, off, s[8:11], 0 offset:944 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v18, off, s[8:11], 0 offset:948 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v19, off, s[8:11], 0 offset:952 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v20, off, s[8:11], 0 offset:956 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v21, off, s[8:11], 0 offset:960 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v22, off, s[8:11], 0 offset:964 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v23, off, s[8:11], 0 offset:968 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v24, off, s[8:11], 0 offset:972 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v25, off, s[8:11], 0 offset:976 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v26, off, s[8:11], 0 offset:980 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v27, off, s[8:11], 0 offset:984 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v28, off, s[8:11], 0 offset:988 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v29, off, s[8:11], 0 offset:992 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v30, off, s[8:11], 0 offset:996 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v31, off, s[8:11], 0 offset:1000 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v32, off, s[8:11], 0 offset:1004 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v33, off, s[8:11], 0 offset:1008 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v34, off, s[8:11], 0 offset:1012 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v35, off, s[8:11], 0 offset:1016 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v36, off, s[8:11], 0 offset:1020 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v37, off, s[8:11], 0 offset:1024 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    global_store_dwordx4 v1, v[14:17], s[2:3] offset:32
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:1028 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:1032 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:1036 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:1040 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:1044 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v11, off, s[8:11], 0 offset:1048 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v12, off, s[8:11], 0 offset:1052 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v13, off, s[8:11], 0 offset:1056 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v14, off, s[8:11], 0 offset:1060 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v15, off, s[8:11], 0 offset:1064 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v16, off, s[8:11], 0 offset:1068 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v17, off, s[8:11], 0 offset:1072 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v18, off, s[8:11], 0 offset:1076 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v19, off, s[8:11], 0 offset:1080 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v20, off, s[8:11], 0 offset:1084 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v21, off, s[8:11], 0 offset:1088 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v22, off, s[8:11], 0 offset:1092 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v23, off, s[8:11], 0 offset:1096 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v24, off, s[8:11], 0 offset:1100 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v25, off, s[8:11], 0 offset:1104 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v26, off, s[8:11], 0 offset:1108 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v27, off, s[8:11], 0 offset:1112 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v28, off, s[8:11], 0 offset:1116 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v29, off, s[8:11], 0 offset:1120 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v30, off, s[8:11], 0 offset:1124 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v31, off, s[8:11], 0 offset:1128 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v32, off, s[8:11], 0 offset:1132 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v33, off, s[8:11], 0 offset:1136 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v34, off, s[8:11], 0 offset:1140 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v35, off, s[8:11], 0 offset:1144 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v36, off, s[8:11], 0 offset:1148 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v37, off, s[8:11], 0 offset:1152 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    global_store_dwordx4 v1, v[10:13], s[2:3] offset:16
+; GFX906-NEXT:    global_store_dwordx4 v1, v[2:5], s[2:3]
+; GFX906-NEXT:    global_store_dwordx4 v1, v[54:57], s[2:3] offset:240
+; GFX906-NEXT:    global_store_dwordx4 v1, v[38:41], s[2:3] offset:224
 ; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:388 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v30, off, s[8:11], 0 offset:376 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:364 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v30, 8, v30
-; GFX906-NEXT:    v_or_b32_sdwa v30, v31, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v30, off, s[8:11], 0 offset:368 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v0, v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:356 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v30, off, s[8:11], 0 offset:360 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v0, v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v1, v[2:5], s[2:3] offset:128
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:348 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v3, v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:336 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v27, off, s[8:11], 0 offset:332 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v4, v28, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:324 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v26, off, s[8:11], 0 offset:344 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v5, v29, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:340 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v26, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v26, off, s[8:11], 0 offset:328 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:316 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v26, 8, v26
-; GFX906-NEXT:    v_or_b32_sdwa v26, v27, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v26, off, s[8:11], 0 offset:320 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v0, v26, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:308 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v26, off, s[8:11], 0 offset:312 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v0, v26, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v1, v[2:5], s[2:3] offset:144
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:300 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v3, v23, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:288 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v23, off, s[8:11], 0 offset:284 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v4, v24, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:276 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v22, off, s[8:11], 0 offset:296 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v5, v25, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:292 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v22, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v22, off, s[8:11], 0 offset:280 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v22, 8, v22
-; GFX906-NEXT:    v_or_b32_sdwa v22, v23, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v22, off, s[8:11], 0 offset:272 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v0, v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:260 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v22, off, s[8:11], 0 offset:264 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v0, v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v1, v[2:5], s[2:3] offset:160
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:252 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v3, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:240 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v19, off, s[8:11], 0 offset:236 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v4, v20, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:228 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v18, off, s[8:11], 0 offset:248 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v5, v21, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:244 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v18, off, s[8:11], 0 offset:232 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:220 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v18, 8, v18
-; GFX906-NEXT:    v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v18, off, s[8:11], 0 offset:224 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v0, v18, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:212 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v18, off, s[8:11], 0 offset:216 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v0, v18, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v1, v[2:5], s[2:3] offset:176
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:204 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:208 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v3, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:192 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v15, off, s[8:11], 0 offset:188 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v14, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v4, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:180 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v14, off, s[8:11], 0 offset:200 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v5, v17, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:196 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v14, off, s[8:11], 0 offset:184 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:172 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v14, 8, v14
-; GFX906-NEXT:    v_or_b32_sdwa v14, v15, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v14, off, s[8:11], 0 offset:176 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v0, v14, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:164 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v14, off, s[8:11], 0 offset:168 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v0, v14, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v1, v[2:5], s[2:3] offset:192
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:160 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:156 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:152 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v4, off, s[8:11], 0 offset:144 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:132 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:120 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:148 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:140 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v4, off, s[8:11], 0 offset:128 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v4, 8, v4
-; GFX906-NEXT:    v_or_b32_sdwa v0, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:124 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:116 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
-; GFX906-NEXT:    v_or_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v5, v10, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v1, v[2:5], s[2:3] offset:208
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:392 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v4, off, s[8:11], 0 offset:396 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:400 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:404 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:408 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:412 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:416 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:420 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v11, off, s[8:11], 0 offset:424 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v12, off, s[8:11], 0 offset:428 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v13, off, s[8:11], 0 offset:432 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v14, off, s[8:11], 0 offset:436 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v15, off, s[8:11], 0 offset:440 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v16, off, s[8:11], 0 offset:444 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v17, off, s[8:11], 0 offset:448 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v18, off, s[8:11], 0 offset:452 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v19, off, s[8:11], 0 offset:456 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v20, off, s[8:11], 0 offset:460 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v21, off, s[8:11], 0 offset:464 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v22, off, s[8:11], 0 offset:468 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v23, off, s[8:11], 0 offset:472 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v24, off, s[8:11], 0 offset:476 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v25, off, s[8:11], 0 offset:480 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v26, off, s[8:11], 0 offset:484 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v27, off, s[8:11], 0 offset:488 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v28, off, s[8:11], 0 offset:492 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v29, off, s[8:11], 0 offset:496 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v30, off, s[8:11], 0 offset:500 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v31, off, s[8:11], 0 offset:504 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v32, off, s[8:11], 0 offset:508 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v33, off, s[8:11], 0 offset:512 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    global_store_dwordx4 v1, v[22:25], s[2:3] offset:208
+; GFX906-NEXT:    global_store_dwordx4 v1, v[42:45], s[2:3] offset:192
+; GFX906-NEXT:    buffer_load_dword v16, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:108 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:104 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v4, off, s[8:11], 0 offset:96 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:84 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:72 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:100 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:92 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:88 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v4, off, s[8:11], 0 offset:80 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v4, 8, v4
-; GFX906-NEXT:    v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:76 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:68 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
-; GFX906-NEXT:    v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v1, v[2:5], s[2:3] offset:224
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v17, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v18, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v19, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v20, off, s[8:11], 0 offset:20 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v21, off, s[8:11], 0 offset:24 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v22, off, s[8:11], 0 offset:28 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v23, off, s[8:11], 0 offset:32 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v24, off, s[8:11], 0 offset:36 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v25, off, s[8:11], 0 offset:40 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v26, off, s[8:11], 0 offset:44 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v27, off, s[8:11], 0 offset:48 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v28, off, s[8:11], 0 offset:52 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v29, off, s[8:11], 0 offset:56 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v30, off, s[8:11], 0 offset:60 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v31, off, s[8:11], 0 offset:64 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v32, off, s[8:11], 0 offset:68 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v33, off, s[8:11], 0 offset:72 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v34, off, s[8:11], 0 offset:76 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v35, off, s[8:11], 0 offset:80 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v36, off, s[8:11], 0 offset:84 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v37, off, s[8:11], 0 offset:88 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v38, off, s[8:11], 0 offset:92 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v39, off, s[8:11], 0 offset:96 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v40, off, s[8:11], 0 offset:100 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v41, off, s[8:11], 0 offset:104 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v42, off, s[8:11], 0 offset:108 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v43, off, s[8:11], 0 offset:112 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v44, off, s[8:11], 0 offset:116 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v45, off, s[8:11], 0 offset:120 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v46, off, s[8:11], 0 offset:124 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v47, off, s[8:11], 0 offset:128 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    global_store_dwordx4 v1, v[28:31], s[2:3] offset:176
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:516 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v4, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:60 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(5)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:56 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:52 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:48 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:44 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:40 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:36 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v4, off, s[8:11], 0 offset:32 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v4, 8, v4
-; GFX906-NEXT:    v_or_b32_sdwa v4, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:28 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:24 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:20 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:520 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v4, off, s[8:11], 0 offset:524 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:528 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:532 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:536 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:540 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:544 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:548 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v11, off, s[8:11], 0 offset:552 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v12, off, s[8:11], 0 offset:556 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v13, off, s[8:11], 0 offset:560 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v14, off, s[8:11], 0 offset:564 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v15, off, s[8:11], 0 offset:568 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v16, off, s[8:11], 0 offset:572 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v17, off, s[8:11], 0 offset:576 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v18, off, s[8:11], 0 offset:580 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v19, off, s[8:11], 0 offset:584 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v20, off, s[8:11], 0 offset:588 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v21, off, s[8:11], 0 offset:592 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v22, off, s[8:11], 0 offset:596 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v23, off, s[8:11], 0 offset:600 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v24, off, s[8:11], 0 offset:604 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v25, off, s[8:11], 0 offset:608 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v26, off, s[8:11], 0 offset:612 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v27, off, s[8:11], 0 offset:616 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v28, off, s[8:11], 0 offset:620 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v29, off, s[8:11], 0 offset:624 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v30, off, s[8:11], 0 offset:628 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v31, off, s[8:11], 0 offset:632 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v32, off, s[8:11], 0 offset:636 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v33, off, s[8:11], 0 offset:640 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    global_store_dwordx4 v1, v[10:13], s[2:3] offset:160
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:644 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:648 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v4, off, s[8:11], 0 offset:652 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:656 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:660 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:664 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:668 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:672 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:676 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v11, off, s[8:11], 0 offset:680 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v12, off, s[8:11], 0 offset:684 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v13, off, s[8:11], 0 offset:688 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v14, off, s[8:11], 0 offset:692 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v15, off, s[8:11], 0 offset:696 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v16, off, s[8:11], 0 offset:700 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v17, off, s[8:11], 0 offset:704 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v18, off, s[8:11], 0 offset:708 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v19, off, s[8:11], 0 offset:712 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v20, off, s[8:11], 0 offset:716 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v21, off, s[8:11], 0 offset:720 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v22, off, s[8:11], 0 offset:724 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v23, off, s[8:11], 0 offset:728 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v24, off, s[8:11], 0 offset:732 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v25, off, s[8:11], 0 offset:736 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v26, off, s[8:11], 0 offset:740 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v27, off, s[8:11], 0 offset:744 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v28, off, s[8:11], 0 offset:748 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v29, off, s[8:11], 0 offset:752 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v30, off, s[8:11], 0 offset:756 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v31, off, s[8:11], 0 offset:760 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v32, off, s[8:11], 0 offset:764 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v33, off, s[8:11], 0 offset:768 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    global_store_dwordx4 v1, v[6:9], s[2:3] offset:144
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:132 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:136 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v4, off, s[8:11], 0 offset:140 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:144 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:148 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:152 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:156 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:160 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:164 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v11, off, s[8:11], 0 offset:168 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v12, off, s[8:11], 0 offset:172 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v13, off, s[8:11], 0 offset:176 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v14, off, s[8:11], 0 offset:180 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v15, off, s[8:11], 0 offset:184 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v16, off, s[8:11], 0 offset:188 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v17, off, s[8:11], 0 offset:192 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v18, off, s[8:11], 0 offset:196 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v19, off, s[8:11], 0 offset:200 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v20, off, s[8:11], 0 offset:204 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v21, off, s[8:11], 0 offset:208 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v22, off, s[8:11], 0 offset:212 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v23, off, s[8:11], 0 offset:216 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v24, off, s[8:11], 0 offset:220 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v25, off, s[8:11], 0 offset:224 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v26, off, s[8:11], 0 offset:228 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v27, off, s[8:11], 0 offset:232 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v28, off, s[8:11], 0 offset:236 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v29, off, s[8:11], 0 offset:240 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v30, off, s[8:11], 0 offset:244 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v31, off, s[8:11], 0 offset:248 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v32, off, s[8:11], 0 offset:252 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v33, off, s[8:11], 0 offset:256 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
-; GFX906-NEXT:    v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v1, v[2:5], s[2:3] offset:240
+; GFX906-NEXT:    global_store_dwordx4 v1, v[2:5], s[2:3] offset:128
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()



More information about the llvm-commits mailing list