[llvm] 3e53c97 - Revert "[AMDGPU] Add IR LiveReg type-based optimization" (#97138)

via llvm-commits llvm-commits at lists.llvm.org
Fri Jun 28 23:18:30 PDT 2024


Author: Vitaly Buka
Date: 2024-06-28T23:18:26-07:00
New Revision: 3e53c97d33210db68188e731e93ee48dbaeeae32

URL: https://github.com/llvm/llvm-project/commit/3e53c97d33210db68188e731e93ee48dbaeeae32
DIFF: https://github.com/llvm/llvm-project/commit/3e53c97d33210db68188e731e93ee48dbaeeae32.diff

LOG: Revert "[AMDGPU] Add IR LiveReg type-based optimization" (#97138)

Part of #66838.

https://lab.llvm.org/buildbot/#/builders/52/builds/404
https://lab.llvm.org/buildbot/#/builders/55/builds/358
https://lab.llvm.org/buildbot/#/builders/164/builds/518

This reverts commit ded956440739ae326a99cbaef18ce4362e972679.

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
    llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
    llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll
    llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
    llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
    llvm/test/CodeGen/AMDGPU/extract-subvector.ll
    llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
    llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
    llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll

Removed: 
    llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
    llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 7623b73d6dd5f..69fdeaebe0a01 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -81,73 +81,6 @@ class AMDGPULateCodeGenPrepare
   bool visitLoadInst(LoadInst &LI);
 };
 
-using ValueToValueMap = DenseMap<const Value *, Value *>;
-
-class LiveRegOptimizer {
-private:
-  Module *Mod = nullptr;
-  const DataLayout *DL = nullptr;
-  const GCNSubtarget *ST;
-  /// The scalar type to convert to
-  Type *ConvertToScalar;
-  /// The set of visited Instructions
-  SmallPtrSet<Instruction *, 4> Visited;
-  /// The set of Instructions to be deleted
-  SmallPtrSet<Instruction *, 4> DeadInstrs;
-  /// Map of Value -> Converted Value
-  ValueToValueMap ValMap;
-  /// Map of containing conversions from Optimal Type -> Original Type per BB.
-  DenseMap<BasicBlock *, ValueToValueMap> BBUseValMap;
-
-public:
-  /// Calculate the and \p return  the type to convert to given a problematic \p
-  /// OriginalType. In some instances, we may widen the type (e.g. v2i8 -> i32).
-  Type *calculateConvertType(Type *OriginalType);
-  /// Convert the virtual register defined by \p V to the compatible vector of
-  /// legal type
-  Value *convertToOptType(Instruction *V, BasicBlock::iterator &InstPt);
-  /// Convert the virtual register defined by \p V back to the original type \p
-  /// ConvertType, stripping away the MSBs in cases where there was an imperfect
-  /// fit (e.g. v2i32 -> v7i8)
-  Value *convertFromOptType(Type *ConvertType, Instruction *V,
-                            BasicBlock::iterator &InstPt,
-                            BasicBlock *InsertBlock);
-  /// Check for problematic PHI nodes or cross-bb values based on the value
-  /// defined by \p I, and coerce to legal types if necessary. For problematic
-  /// PHI node, we coerce all incoming values in a single invocation.
-  bool optimizeLiveType(Instruction *I);
-
-  /// Remove all instructions that have become dead (i.e. all the re-typed PHIs)
-  void removeDeadInstrs();
-
-  // Whether or not the type should be replaced to avoid inefficient
-  // legalization code
-  bool shouldReplace(Type *ITy) {
-    FixedVectorType *VTy = dyn_cast<FixedVectorType>(ITy);
-    if (!VTy)
-      return false;
-
-    auto TLI = ST->getTargetLowering();
-
-    Type *EltTy = VTy->getElementType();
-    // If the element size is not less than the convert to scalar size, then we
-    // can't do any bit packing
-    if (!EltTy->isIntegerTy() ||
-        EltTy->getScalarSizeInBits() > ConvertToScalar->getScalarSizeInBits())
-      return false;
-
-    // Only coerce illegal types
-    TargetLoweringBase::LegalizeKind LK =
-        TLI->getTypeConversion(EltTy->getContext(), EVT::getEVT(EltTy, false));
-    return LK.first != TargetLoweringBase::TypeLegal;
-  }
-
-  LiveRegOptimizer(Module *Mod, const GCNSubtarget *ST) : Mod(Mod), ST(ST) {
-    DL = &Mod->getDataLayout();
-    ConvertToScalar = Type::getInt32Ty(Mod->getContext());
-  }
-};
-
 } // end anonymous namespace
 
 bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
@@ -169,238 +102,14 @@ bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
   AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
   UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
 
-  // "Optimize" the virtual regs that cross basic block boundaries. When
-  // building the SelectionDAG, vectors of illegal types that cross basic blocks
-  // will be scalarized and widened, with each scalar living in its
-  // own register. To work around this, this optimization converts the
-  // vectors to equivalent vectors of legal type (which are converted back
-  // before uses in subsequent blocks), to pack the bits into fewer physical
-  // registers (used in CopyToReg/CopyFromReg pairs).
-  LiveRegOptimizer LRO(Mod, &ST);
-
   bool Changed = false;
-
   for (auto &BB : F)
-    for (Instruction &I : make_early_inc_range(BB)) {
+    for (Instruction &I : llvm::make_early_inc_range(BB))
       Changed |= visit(I);
-      Changed |= LRO.optimizeLiveType(&I);
-    }
 
-  LRO.removeDeadInstrs();
   return Changed;
 }
 
-Type *LiveRegOptimizer::calculateConvertType(Type *OriginalType) {
-  assert(OriginalType->getScalarSizeInBits() <=
-         ConvertToScalar->getScalarSizeInBits());
-
-  FixedVectorType *VTy = cast<FixedVectorType>(OriginalType);
-
-  TypeSize OriginalSize = DL->getTypeSizeInBits(VTy);
-  TypeSize ConvertScalarSize = DL->getTypeSizeInBits(ConvertToScalar);
-  unsigned ConvertEltCount =
-      (OriginalSize + ConvertScalarSize - 1) / ConvertScalarSize;
-
-  if (OriginalSize <= ConvertScalarSize)
-    return IntegerType::get(Mod->getContext(), ConvertScalarSize);
-
-  return VectorType::get(Type::getIntNTy(Mod->getContext(), ConvertScalarSize),
-                         ConvertEltCount, false);
-}
-
-Value *LiveRegOptimizer::convertToOptType(Instruction *V,
-                                          BasicBlock::iterator &InsertPt) {
-  FixedVectorType *VTy = cast<FixedVectorType>(V->getType());
-  Type *NewTy = calculateConvertType(V->getType());
-
-  TypeSize OriginalSize = DL->getTypeSizeInBits(VTy);
-  TypeSize NewSize = DL->getTypeSizeInBits(NewTy);
-
-  IRBuilder<> Builder(V->getParent(), InsertPt);
-  // If there is a bitsize match, we can fit the old vector into a new vector of
-  // desired type.
-  if (OriginalSize == NewSize)
-    return Builder.CreateBitCast(V, NewTy, V->getName() + ".bc");
-
-  // If there is a bitsize mismatch, we must use a wider vector.
-  assert(NewSize > OriginalSize);
-  uint64_t ExpandedVecElementCount = NewSize / VTy->getScalarSizeInBits();
-
-  SmallVector<int, 8> ShuffleMask;
-  uint64_t OriginalElementCount = VTy->getElementCount().getFixedValue();
-  for (unsigned I = 0; I < OriginalElementCount; I++)
-    ShuffleMask.push_back(I);
-
-  for (uint64_t I = OriginalElementCount; I < ExpandedVecElementCount; I++)
-    ShuffleMask.push_back(OriginalElementCount);
-
-  Value *ExpandedVec = Builder.CreateShuffleVector(V, ShuffleMask);
-  return Builder.CreateBitCast(ExpandedVec, NewTy, V->getName() + ".bc");
-}
-
-Value *LiveRegOptimizer::convertFromOptType(Type *ConvertType, Instruction *V,
-                                            BasicBlock::iterator &InsertPt,
-                                            BasicBlock *InsertBB) {
-  FixedVectorType *NewVTy = cast<FixedVectorType>(ConvertType);
-
-  TypeSize OriginalSize = DL->getTypeSizeInBits(V->getType());
-  TypeSize NewSize = DL->getTypeSizeInBits(NewVTy);
-
-  IRBuilder<> Builder(InsertBB, InsertPt);
-  // If there is a bitsize match, we simply convert back to the original type.
-  if (OriginalSize == NewSize)
-    return Builder.CreateBitCast(V, NewVTy, V->getName() + ".bc");
-
-  // If there is a bitsize mismatch, then we must have used a wider value to
-  // hold the bits.
-  assert(OriginalSize > NewSize);
-  // For wide scalars, we can just truncate the value.
-  if (!V->getType()->isVectorTy()) {
-    Instruction *Trunc = cast<Instruction>(
-        Builder.CreateTrunc(V, IntegerType::get(Mod->getContext(), NewSize)));
-    return cast<Instruction>(Builder.CreateBitCast(Trunc, NewVTy));
-  }
-
-  // For wider vectors, we must strip the MSBs to convert back to the original
-  // type.
-  VectorType *ExpandedVT = VectorType::get(
-      Type::getIntNTy(Mod->getContext(), NewVTy->getScalarSizeInBits()),
-      (OriginalSize / NewVTy->getScalarSizeInBits()), false);
-  Instruction *Converted =
-      cast<Instruction>(Builder.CreateBitCast(V, ExpandedVT));
-
-  unsigned NarrowElementCount = NewVTy->getElementCount().getFixedValue();
-  SmallVector<int, 8> ShuffleMask(NarrowElementCount);
-  std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);
-
-  return Builder.CreateShuffleVector(Converted, ShuffleMask);
-}
-
-bool LiveRegOptimizer::optimizeLiveType(Instruction *I) {
-  SmallVector<Instruction *, 4> Worklist;
-  SmallPtrSet<PHINode *, 4> PhiNodes;
-  SmallPtrSet<Instruction *, 4> Defs;
-  SmallPtrSet<Instruction *, 4> Uses;
-
-  Worklist.push_back(cast<Instruction>(I));
-  while (!Worklist.empty()) {
-    Instruction *II = Worklist.pop_back_val();
-
-    if (!Visited.insert(II).second)
-      continue;
-
-    if (!shouldReplace(II->getType()))
-      continue;
-
-    if (PHINode *Phi = dyn_cast<PHINode>(II)) {
-      PhiNodes.insert(Phi);
-      // Collect all the incoming values of problematic PHI nodes.
-      for (Value *V : Phi->incoming_values()) {
-        // Repeat the collection process for newly found PHI nodes.
-        if (PHINode *OpPhi = dyn_cast<PHINode>(V)) {
-          if (!PhiNodes.count(OpPhi) && !Visited.count(OpPhi))
-            Worklist.push_back(OpPhi);
-          continue;
-        }
-
-        Instruction *IncInst = dyn_cast<Instruction>(V);
-        // Other incoming value types (e.g. vector literals) are unhandled
-        if (!IncInst && !isa<ConstantAggregateZero>(V))
-          return false;
-
-        // Collect all other incoming values for coercion.
-        if (IncInst)
-          Defs.insert(IncInst);
-      }
-    }
-
-    // Collect all relevant uses.
-    for (User *V : II->users()) {
-      // Repeat the collection process for problematic PHI nodes.
-      if (PHINode *OpPhi = dyn_cast<PHINode>(V)) {
-        if (!PhiNodes.count(OpPhi) && !Visited.count(OpPhi))
-          Worklist.push_back(OpPhi);
-        continue;
-      }
-
-      Instruction *UseInst = cast<Instruction>(V);
-      // Collect all uses of PHINodes and any use the crosses BB boundaries.
-      if (UseInst->getParent() != II->getParent() || isa<PHINode>(II)) {
-        Uses.insert(UseInst);
-        if (!Defs.count(II) && !isa<PHINode>(II)) {
-          Defs.insert(II);
-        }
-      }
-    }
-  }
-
-  // Coerce and track the defs.
-  for (Instruction *D : Defs) {
-    if (!ValMap.contains(D)) {
-      BasicBlock::iterator InsertPt = std::next(D->getIterator());
-      Value *ConvertVal = convertToOptType(D, InsertPt);
-      assert(ConvertVal);
-      ValMap[D] = ConvertVal;
-    }
-  }
-
-  // Construct new-typed PHI nodes.
-  for (PHINode *Phi : PhiNodes) {
-    ValMap[Phi] = PHINode::Create(calculateConvertType(Phi->getType()),
-                                  Phi->getNumIncomingValues(),
-                                  Phi->getName() + ".tc", Phi->getIterator());
-  }
-
-  // Connect all the PHI nodes with their new incoming values.
-  for (PHINode *Phi : PhiNodes) {
-    PHINode *NewPhi = cast<PHINode>(ValMap[Phi]);
-    bool MissingIncVal = false;
-    for (int I = 0, E = Phi->getNumIncomingValues(); I < E; I++) {
-      Value *IncVal = Phi->getIncomingValue(I);
-      if (isa<ConstantAggregateZero>(IncVal)) {
-        Type *NewType = calculateConvertType(Phi->getType());
-        NewPhi->addIncoming(ConstantInt::get(NewType, 0, false),
-                            Phi->getIncomingBlock(I));
-      } else if (ValMap.contains(IncVal))
-        NewPhi->addIncoming(ValMap[IncVal], Phi->getIncomingBlock(I));
-      else
-        MissingIncVal = true;
-    }
-    DeadInstrs.insert(MissingIncVal ? cast<Instruction>(ValMap[Phi]) : Phi);
-  }
-  // Coerce back to the original type and replace the uses.
-  for (Instruction *U : Uses) {
-    // Replace all converted operands for a use.
-    for (auto [OpIdx, Op] : enumerate(U->operands())) {
-      if (ValMap.contains(Op)) {
-        Value *NewVal = nullptr;
-        if (BBUseValMap.contains(U->getParent()) &&
-            BBUseValMap[U->getParent()].contains(ValMap[Op]))
-          NewVal = BBUseValMap[U->getParent()][ValMap[Op]];
-        else {
-          BasicBlock::iterator InsertPt = U->getParent()->getFirstNonPHIIt();
-          NewVal =
-              convertFromOptType(Op->getType(), cast<Instruction>(ValMap[Op]),
-                                 InsertPt, U->getParent());
-          BBUseValMap[U->getParent()][ValMap[Op]] = NewVal;
-        }
-        assert(NewVal);
-        U->setOperand(OpIdx, NewVal);
-      }
-    }
-  }
-
-  return true;
-}
-
-void LiveRegOptimizer::removeDeadInstrs() {
-  // Remove instrs that have been marked dead after type-coercion.
-  for (auto *I : DeadInstrs) {
-    I->replaceAllUsesWith(PoisonValue::get(I->getType()));
-    I->eraseFromParent();
-  }
-}
-
 bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
   unsigned AS = LI.getPointerAddressSpace();
   // Skip non-constant address space.
@@ -410,7 +119,7 @@ bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
   // Skip non-simple loads.
   if (!LI.isSimple())
     return false;
-  Type *Ty = LI.getType();
+  auto *Ty = LI.getType();
   // Skip aggregate types.
   if (Ty->isAggregateType())
     return false;

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index f50a18ccc2188..9162e110aa10b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1197,10 +1197,10 @@ bool GCNPassConfig::addPreISel() {
   AMDGPUPassConfig::addPreISel();
 
   if (TM->getOptLevel() > CodeGenOptLevel::None)
-    addPass(createSinkingPass());
+    addPass(createAMDGPULateCodeGenPreparePass());
 
   if (TM->getOptLevel() > CodeGenOptLevel::None)
-    addPass(createAMDGPULateCodeGenPreparePass());
+    addPass(createSinkingPass());
 
   // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
   // regions formed by them.

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
deleted file mode 100644
index 83cb92210ec84..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
+++ /dev/null
@@ -1,636 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
-
-define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: v3i8_liveout:
-; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX906-NEXT:    v_mov_b32_e32 v3, 8
-; GFX906-NEXT:    v_mov_b32_e32 v5, 16
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dword v4, v2, s[4:5]
-; GFX906-NEXT:    v_mov_b32_e32 v1, 0xff
-; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_and_b32_e32 v6, 0xff, v4
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v7, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX906-NEXT:    v_or3_b32 v4, v6, v7, v4
-; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB0_2
-; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dword v0, v2, s[6:7]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v0
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX906-NEXT:    v_or3_b32 v4, v2, v3, v0
-; GFX906-NEXT:  .LBB0_2: ; %bb.2
-; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v4
-; GFX906-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX906-NEXT:    v_mov_b32_e32 v1, 0
-; GFX906-NEXT:    global_store_short v1, v0, s[2:3]
-; GFX906-NEXT:    global_store_byte_d16_hi v1, v0, s[2:3] offset:2
-; GFX906-NEXT:    s_endpgm
-entry:
-  %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <3 x i8>, ptr addrspace(1) %src1, i32 %idx
-  %vec1 = load <3 x i8>, ptr addrspace(1) %gep1
-  %gep2 = getelementptr <3 x i8>, ptr addrspace(1) %src2, i32 %idx
-  %vec2 = load <3 x i8>, ptr addrspace(1) %gep2
-  %cmp = icmp ult i32 %idx, 15
-  br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
-  br label %bb.2
-
-bb.2:
-  %tmp5 = phi <3 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
-  store <3 x i8> %tmp5, ptr addrspace(1) %dst, align 4
-  ret void
-}
-
-define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: v4i8_liveout:
-; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dword v1, v2, s[4:5]
-; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB1_2
-; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dword v1, v2, s[6:7]
-; GFX906-NEXT:  .LBB1_2: ; %bb.2
-; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    v_mov_b32_e32 v0, 0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX906-NEXT:    s_endpgm
-entry:
-  %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
-  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
-  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
-  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
-  %cmp = icmp ult i32 %idx, 15
-  br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
-  br label %bb.2
-
-bb.2:
-  %tmp5 = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
-  store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4
-  ret void
-}
-
-define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: v5i8_liveout:
-; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
-; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[4:5]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB2_2
-; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[6:7]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX906-NEXT:  .LBB2_2: ; %bb.2
-; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    v_mov_b32_e32 v4, 0
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
-; GFX906-NEXT:    global_store_byte v4, v1, s[2:3]
-; GFX906-NEXT:    global_store_byte v4, v0, s[2:3] offset:1
-; GFX906-NEXT:    global_store_byte_d16_hi v4, v1, s[2:3] offset:2
-; GFX906-NEXT:    global_store_byte v4, v3, s[2:3] offset:3
-; GFX906-NEXT:    global_store_byte v4, v2, s[2:3] offset:4
-; GFX906-NEXT:    s_endpgm
-entry:
-  %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <5 x i8>, ptr addrspace(1) %src1, i32 %idx
-  %vec1 = load <5 x i8>, ptr addrspace(1) %gep1
-  %gep2 = getelementptr <5 x i8>, ptr addrspace(1) %src2, i32 %idx
-  %vec2 = load <5 x i8>, ptr addrspace(1) %gep2
-  %cmp = icmp ult i32 %idx, 15
-  br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
-  br label %bb.2
-
-bb.2:
-  %tmp5 = phi <5 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
-  store <5 x i8> %tmp5, ptr addrspace(1) %dst, align 4
-  ret void
-}
-
-define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: v8i8_liveout:
-; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
-; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[4:5]
-; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB3_2
-; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[6:7]
-; GFX906-NEXT:  .LBB3_2: ; %bb.2
-; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    v_mov_b32_e32 v0, 0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[2:3]
-; GFX906-NEXT:    s_endpgm
-entry:
-  %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
-  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
-  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
-  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
-  %cmp = icmp ult i32 %idx, 15
-  br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
-  br label %bb.2
-
-bb.2:
-  %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
-  store <8 x i8> %tmp5, ptr addrspace(1) %dst, align 4
-  ret void
-}
-
-define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: v16i8_liveout:
-; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 4, v0
-; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx4 v[1:4], v5, s[4:5]
-; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB4_2
-; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx4 v[1:4], v5, s[6:7]
-; GFX906-NEXT:  .LBB4_2: ; %bb.2
-; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    v_mov_b32_e32 v0, 0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    global_store_dwordx4 v0, v[1:4], s[2:3]
-; GFX906-NEXT:    s_endpgm
-entry:
-  %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <16 x i8>, ptr addrspace(1) %src1, i32 %idx
-  %vec1 = load <16 x i8>, ptr addrspace(1) %gep1
-  %gep2 = getelementptr <16 x i8>, ptr addrspace(1) %src2, i32 %idx
-  %vec2 = load <16 x i8>, ptr addrspace(1) %gep2
-  %cmp = icmp ult i32 %idx, 15
-  br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
-  br label %bb.2
-
-bb.2:
-  %tmp5 = phi <16 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
-  store <16 x i8> %tmp5, ptr addrspace(1) %dst, align 4
-  ret void
-}
-
-define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: v32i8_liveout:
-; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 5, v0
-; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx4 v[1:4], v9, s[4:5]
-; GFX906-NEXT:    global_load_dwordx4 v[5:8], v9, s[4:5] offset:16
-; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB5_2
-; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx4 v[1:4], v9, s[6:7]
-; GFX906-NEXT:    global_load_dwordx4 v[5:8], v9, s[6:7] offset:16
-; GFX906-NEXT:  .LBB5_2: ; %bb.2
-; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    v_mov_b32_e32 v0, 0
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    global_store_dwordx4 v0, v[1:4], s[2:3]
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    global_store_dwordx4 v0, v[5:8], s[2:3] offset:16
-; GFX906-NEXT:    s_endpgm
-entry:
-  %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx
-  %vec1 = load <32 x i8>, ptr addrspace(1) %gep1
-  %gep2 = getelementptr <32 x i8>, ptr addrspace(1) %src2, i32 %idx
-  %vec2 = load <32 x i8>, ptr addrspace(1) %gep2
-  %cmp = icmp ult i32 %idx, 15
-  br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
-  br label %bb.2
-
-bb.2:
-  %tmp5 = phi <32 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
-  store <32 x i8> %tmp5, ptr addrspace(1) %dst, align 4
-  ret void
-}
-
-define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: v256i8_liveout:
-; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX906-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GFX906-NEXT:    s_mov_b32 s10, -1
-; GFX906-NEXT:    s_mov_b32 s11, 0xe00000
-; GFX906-NEXT:    s_add_u32 s8, s8, s3
-; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
-; GFX906-NEXT:    s_addc_u32 s9, s9, 0
-; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx4 v[5:8], v4, s[4:5]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    buffer_store_dword v5, off, s[8:11], 0 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    buffer_store_dword v6, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v7, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v8, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[5:8], v4, s[4:5] offset:16
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    global_load_dwordx4 v[9:12], v4, s[4:5] offset:32
-; GFX906-NEXT:    global_load_dwordx4 v[13:16], v4, s[4:5] offset:48
-; GFX906-NEXT:    global_load_dwordx4 v[17:20], v4, s[4:5] offset:64
-; GFX906-NEXT:    global_load_dwordx4 v[21:24], v4, s[4:5] offset:80
-; GFX906-NEXT:    global_load_dwordx4 v[25:28], v4, s[4:5] offset:96
-; GFX906-NEXT:    global_load_dwordx4 v[29:32], v4, s[4:5] offset:112
-; GFX906-NEXT:    global_load_dwordx4 v[33:36], v4, s[4:5] offset:128
-; GFX906-NEXT:    global_load_dwordx4 v[37:40], v4, s[4:5] offset:144
-; GFX906-NEXT:    global_load_dwordx4 v[41:44], v4, s[4:5] offset:160
-; GFX906-NEXT:    global_load_dwordx4 v[45:48], v4, s[4:5] offset:176
-; GFX906-NEXT:    global_load_dwordx4 v[49:52], v4, s[4:5] offset:192
-; GFX906-NEXT:    global_load_dwordx4 v[53:56], v4, s[4:5] offset:208
-; GFX906-NEXT:    global_load_dwordx4 v[57:60], v4, s[4:5] offset:224
-; GFX906-NEXT:    global_load_dwordx4 v[0:3], v4, s[4:5] offset:240
-; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB6_2
-; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    buffer_store_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[5:8], v4, s[6:7] offset:16
-; GFX906-NEXT:    global_load_dwordx4 v[9:12], v4, s[6:7] offset:32
-; GFX906-NEXT:    global_load_dwordx4 v[13:16], v4, s[6:7] offset:48
-; GFX906-NEXT:    global_load_dwordx4 v[17:20], v4, s[6:7] offset:64
-; GFX906-NEXT:    global_load_dwordx4 v[21:24], v4, s[6:7] offset:80
-; GFX906-NEXT:    global_load_dwordx4 v[25:28], v4, s[6:7] offset:96
-; GFX906-NEXT:    global_load_dwordx4 v[29:32], v4, s[6:7] offset:112
-; GFX906-NEXT:    global_load_dwordx4 v[33:36], v4, s[6:7] offset:128
-; GFX906-NEXT:    global_load_dwordx4 v[37:40], v4, s[6:7] offset:144
-; GFX906-NEXT:    global_load_dwordx4 v[41:44], v4, s[6:7] offset:160
-; GFX906-NEXT:    global_load_dwordx4 v[45:48], v4, s[6:7] offset:176
-; GFX906-NEXT:    global_load_dwordx4 v[49:52], v4, s[6:7] offset:192
-; GFX906-NEXT:    global_load_dwordx4 v[53:56], v4, s[6:7] offset:208
-; GFX906-NEXT:    global_load_dwordx4 v[57:60], v4, s[6:7] offset:224
-; GFX906-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7] offset:240
-; GFX906-NEXT:  .LBB6_2: ; %bb.2
-; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    buffer_store_dword v1, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v2, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_mov_b32_e32 v0, v57
-; GFX906-NEXT:    v_mov_b32_e32 v1, v58
-; GFX906-NEXT:    v_mov_b32_e32 v2, v59
-; GFX906-NEXT:    v_mov_b32_e32 v3, v60
-; GFX906-NEXT:    v_mov_b32_e32 v60, v56
-; GFX906-NEXT:    v_mov_b32_e32 v59, v55
-; GFX906-NEXT:    v_mov_b32_e32 v58, v54
-; GFX906-NEXT:    v_mov_b32_e32 v57, v53
-; GFX906-NEXT:    v_mov_b32_e32 v56, v52
-; GFX906-NEXT:    v_mov_b32_e32 v55, v51
-; GFX906-NEXT:    v_mov_b32_e32 v54, v50
-; GFX906-NEXT:    v_mov_b32_e32 v53, v49
-; GFX906-NEXT:    v_mov_b32_e32 v52, v48
-; GFX906-NEXT:    v_mov_b32_e32 v51, v47
-; GFX906-NEXT:    v_mov_b32_e32 v50, v46
-; GFX906-NEXT:    v_mov_b32_e32 v49, v45
-; GFX906-NEXT:    v_mov_b32_e32 v48, v44
-; GFX906-NEXT:    v_mov_b32_e32 v47, v43
-; GFX906-NEXT:    v_mov_b32_e32 v46, v42
-; GFX906-NEXT:    v_mov_b32_e32 v45, v41
-; GFX906-NEXT:    v_mov_b32_e32 v44, v40
-; GFX906-NEXT:    v_mov_b32_e32 v43, v39
-; GFX906-NEXT:    v_mov_b32_e32 v42, v38
-; GFX906-NEXT:    v_mov_b32_e32 v41, v37
-; GFX906-NEXT:    v_mov_b32_e32 v40, v36
-; GFX906-NEXT:    v_mov_b32_e32 v39, v35
-; GFX906-NEXT:    v_mov_b32_e32 v38, v34
-; GFX906-NEXT:    v_mov_b32_e32 v37, v33
-; GFX906-NEXT:    v_mov_b32_e32 v36, v32
-; GFX906-NEXT:    v_mov_b32_e32 v35, v31
-; GFX906-NEXT:    v_mov_b32_e32 v34, v30
-; GFX906-NEXT:    v_mov_b32_e32 v33, v29
-; GFX906-NEXT:    v_mov_b32_e32 v32, v28
-; GFX906-NEXT:    v_mov_b32_e32 v31, v27
-; GFX906-NEXT:    v_mov_b32_e32 v30, v26
-; GFX906-NEXT:    v_mov_b32_e32 v29, v25
-; GFX906-NEXT:    v_mov_b32_e32 v28, v24
-; GFX906-NEXT:    v_mov_b32_e32 v27, v23
-; GFX906-NEXT:    v_mov_b32_e32 v26, v22
-; GFX906-NEXT:    v_mov_b32_e32 v25, v21
-; GFX906-NEXT:    v_mov_b32_e32 v24, v20
-; GFX906-NEXT:    v_mov_b32_e32 v23, v19
-; GFX906-NEXT:    v_mov_b32_e32 v22, v18
-; GFX906-NEXT:    v_mov_b32_e32 v21, v17
-; GFX906-NEXT:    v_mov_b32_e32 v20, v16
-; GFX906-NEXT:    v_mov_b32_e32 v19, v15
-; GFX906-NEXT:    v_mov_b32_e32 v18, v14
-; GFX906-NEXT:    v_mov_b32_e32 v17, v13
-; GFX906-NEXT:    v_mov_b32_e32 v16, v12
-; GFX906-NEXT:    v_mov_b32_e32 v15, v11
-; GFX906-NEXT:    v_mov_b32_e32 v14, v10
-; GFX906-NEXT:    v_mov_b32_e32 v13, v9
-; GFX906-NEXT:    v_mov_b32_e32 v12, v8
-; GFX906-NEXT:    v_mov_b32_e32 v11, v7
-; GFX906-NEXT:    v_mov_b32_e32 v10, v6
-; GFX906-NEXT:    v_mov_b32_e32 v9, v5
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_mov_b32_e32 v4, 0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    global_store_dwordx4 v4, v[5:8], s[2:3]
-; GFX906-NEXT:    global_store_dwordx4 v4, v[9:12], s[2:3] offset:16
-; GFX906-NEXT:    global_store_dwordx4 v4, v[13:16], s[2:3] offset:32
-; GFX906-NEXT:    global_store_dwordx4 v4, v[17:20], s[2:3] offset:48
-; GFX906-NEXT:    global_store_dwordx4 v4, v[21:24], s[2:3] offset:64
-; GFX906-NEXT:    global_store_dwordx4 v4, v[25:28], s[2:3] offset:80
-; GFX906-NEXT:    global_store_dwordx4 v4, v[29:32], s[2:3] offset:96
-; GFX906-NEXT:    global_store_dwordx4 v4, v[33:36], s[2:3] offset:112
-; GFX906-NEXT:    global_store_dwordx4 v4, v[37:40], s[2:3] offset:128
-; GFX906-NEXT:    global_store_dwordx4 v4, v[41:44], s[2:3] offset:144
-; GFX906-NEXT:    global_store_dwordx4 v4, v[45:48], s[2:3] offset:160
-; GFX906-NEXT:    global_store_dwordx4 v4, v[49:52], s[2:3] offset:176
-; GFX906-NEXT:    global_store_dwordx4 v4, v[53:56], s[2:3] offset:192
-; GFX906-NEXT:    global_store_dwordx4 v4, v[57:60], s[2:3] offset:208
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:224
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:20 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:24 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:240
-; GFX906-NEXT:    s_endpgm
-entry:
-  %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
-  %vec1 = load <256 x i8>, ptr addrspace(1) %gep1
-  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
-  %vec2 = load <256 x i8>, ptr addrspace(1) %gep2
-  %cmp = icmp ult i32 %idx, 15
-  br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
-  br label %bb.2
-
-bb.2:
-  %tmp5 = phi <256 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
-  store <256 x i8> %tmp5, ptr addrspace(1) %dst, align 4
-  ret void
-}
-
-
-define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: repeat_successor:
-; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_load_dword s2, s[0:1], 0x24
-; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    s_cmp_lt_i32 s2, 3
-; GFX906-NEXT:    s_cbranch_scc0 .LBB7_3
-; GFX906-NEXT:  ; %bb.1: ; %LeafBlock
-; GFX906-NEXT:    s_cmp_ge_i32 s2, 1
-; GFX906-NEXT:    s_cbranch_scc0 .LBB7_6
-; GFX906-NEXT:  ; %bb.2:
-; GFX906-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX906-NEXT:    global_load_dword v0, v0, s[4:5]
-; GFX906-NEXT:    s_branch .LBB7_5
-; GFX906-NEXT:  .LBB7_3: ; %LeafBlock5
-; GFX906-NEXT:    s_cmp_eq_u32 s2, 3
-; GFX906-NEXT:    s_cbranch_scc0 .LBB7_6
-; GFX906-NEXT:  ; %bb.4: ; %sw.bb5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX906-NEXT:    global_load_dword v0, v0, s[6:7]
-; GFX906-NEXT:  .LBB7_5: ; %return.sink.split
-; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
-; GFX906-NEXT:    v_mov_b32_e32 v1, 0
-; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    global_store_dword v1, v0, s[0:1]
-; GFX906-NEXT:  .LBB7_6: ; %return
-; GFX906-NEXT:    s_endpgm
-entry:
-  %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
-  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
-  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
-  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
-  switch i32 %in, label %return [
-    i32 1, label %return.sink.split
-    i32 2, label %return.sink.split
-    i32 3, label %sw.bb5
-  ]
-
-sw.bb5:
-  br label %return.sink.split
-
-return.sink.split:
-  %tmp5 = phi <4 x i8> [ %vec2, %sw.bb5 ], [ %vec1, %entry ], [ %vec1, %entry ]
-  store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4
-  ret void
-
-return:
-  ret void
-}
-
-define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
-; GFX906-LABEL: v8i8_phi_chain:
-; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
-; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[0:1]
-; GFX906-NEXT:    s_xor_b64 s[0:1], vcc, -1
-; GFX906-NEXT:    s_and_saveexec_b64 s[8:9], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB8_2
-; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[2:3]
-; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v0
-; GFX906-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
-; GFX906-NEXT:    s_and_b64 s[2:3], exec, vcc
-; GFX906-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
-; GFX906-NEXT:  .LBB8_2: ; %Flow
-; GFX906-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
-; GFX906-NEXT:    s_cbranch_execz .LBB8_4
-; GFX906-NEXT:  ; %bb.3: ; %bb.2
-; GFX906-NEXT:    v_mov_b32_e32 v0, 0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[4:5]
-; GFX906-NEXT:  .LBB8_4: ; %bb.3
-; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX906-NEXT:    v_mov_b32_e32 v0, 0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[6:7]
-; GFX906-NEXT:    s_endpgm
-entry:
-  %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
-  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
-  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
-  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
-  %cmp = icmp ult i32 %idx, 15
-  br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
-  %cmp2 = icmp ult i32 %idx, 7
-  br i1 %cmp2, label %bb.2, label %bb.3
-
-bb.2:
-  %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
-  store <8 x i8> %tmp5, ptr addrspace(1) %dst0, align 4
-  br label %bb.3
-
-bb.3:
-  %tmp7 = phi <8 x i8> [ %vec2, %bb.1], [%tmp5, %bb.2]
-  store <8 x i8> %tmp7, ptr addrspace(1) %dst1, align 4
-  ret void
-}
-
-define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
-; GFX906-LABEL: v8i8_multi_block:
-; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
-; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx2 v[3:4], v5, s[0:1]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_mov_b32_e32 v1, v3
-; GFX906-NEXT:    v_mov_b32_e32 v2, v4
-; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB9_4
-; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v5, s[2:3]
-; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v0
-; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB9_3
-; GFX906-NEXT:  ; %bb.2: ; %bb.2
-; GFX906-NEXT:    v_mov_b32_e32 v0, 0
-; GFX906-NEXT:    global_store_dwordx2 v0, v[3:4], s[4:5]
-; GFX906-NEXT:  .LBB9_3: ; %Flow
-; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX906-NEXT:  .LBB9_4: ; %bb.3
-; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    v_mov_b32_e32 v0, 0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[6:7]
-; GFX906-NEXT:    s_endpgm
-entry:
-  %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
-  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
-  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
-  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
-  %cmp = icmp ult i32 %idx, 15
-  br i1 %cmp, label %bb.1, label %bb.3
-bb.1:
-  %cmp2 = icmp ult i32 %idx, 7
-  br i1 %cmp2, label %bb.2, label %bb.3
-
-bb.2:
-  store <8 x i8> %vec1, ptr addrspace(1) %dst0, align 4
-  br label %bb.3
-
-bb.3:
-  %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ], [ %vec2, %bb.2]
-  store <8 x i8> %tmp5, ptr addrspace(1) %dst1, align 4
-  ret void
-}
-
-define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: v32i8_loop_carried:
-; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 5, v0
-; GFX906-NEXT:    v_mov_b32_e32 v3, 8
-; GFX906-NEXT:    v_mov_b32_e32 v2, 0xff
-; GFX906-NEXT:    v_cmp_le_u32_e32 vcc, 15, v0
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dword v1, v1, s[2:3]
-; GFX906-NEXT:    s_mov_b64 s[2:3], 0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX906-NEXT:    v_and_or_b32 v0, v1, v2, v0
-; GFX906-NEXT:    v_mov_b32_e32 v2, 24
-; GFX906-NEXT:  .LBB10_1: ; %bb.1
-; GFX906-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX906-NEXT:    v_and_b32_e32 v3, 0xff, v1
-; GFX906-NEXT:    s_and_b64 s[4:5], exec, vcc
-; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX906-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
-; GFX906-NEXT:    v_or3_b32 v1, v0, v3, v1
-; GFX906-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX906-NEXT:    s_cbranch_execnz .LBB10_1
-; GFX906-NEXT:  ; %bb.2: ; %bb.2.loopexit
-; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX906-NEXT:    v_mov_b32_e32 v0, 0
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX906-NEXT:    s_endpgm
-entry:
-  %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx
-  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
-  br label %bb.1
-
-bb.1:
-  %temp = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
-  %vec2 = shufflevector <4 x i8> %vec1, <4 x i8> %temp, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-  %cmp = icmp ult i32 %idx, 15
-  br i1 %cmp, label %bb.1, label %bb.2
-  br label %bb.2
-
-bb.2:
-  store <4 x i8> %vec2, ptr addrspace(1) %dst, align 4
-  ret void
-}
-
-
-declare i32 @llvm.amdgcn.workitem.id.x()
-

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll
index 11772d252a16f..93b9aeac3cd3f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll
@@ -987,8 +987,8 @@ define amdgpu_kernel void @phi_v7i16_switch(<7 x i16> %in, ptr %out, i8 %cond) {
 ; OPT-NEXT:  entry:
 ; OPT-NEXT:    [[X:%.*]] = insertelement <7 x i16> [[IN:%.*]], i16 3, i32 3
 ; OPT-NEXT:    switch i8 [[COND:%.*]], label [[ELSE:%.*]] [
-; OPT-NEXT:      i8 0, label [[THEN_1:%.*]]
-; OPT-NEXT:      i8 3, label [[THEN_2:%.*]]
+; OPT-NEXT:    i8 0, label [[THEN_1:%.*]]
+; OPT-NEXT:    i8 3, label [[THEN_2:%.*]]
 ; OPT-NEXT:    ]
 ; OPT:       then.1:
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE0:%.*]] = shufflevector <7 x i16> [[X]], <7 x i16> poison, <2 x i32> <i32 0, i32 1>
@@ -1025,8 +1025,8 @@ define amdgpu_kernel void @phi_v7i16_switch(<7 x i16> %in, ptr %out, i8 %cond) {
 ; NOOPT-NEXT:  entry:
 ; NOOPT-NEXT:    [[X:%.*]] = insertelement <7 x i16> [[IN:%.*]], i16 3, i32 3
 ; NOOPT-NEXT:    switch i8 [[COND:%.*]], label [[ELSE:%.*]] [
-; NOOPT-NEXT:      i8 0, label [[THEN_1:%.*]]
-; NOOPT-NEXT:      i8 3, label [[THEN_2:%.*]]
+; NOOPT-NEXT:    i8 0, label [[THEN_1:%.*]]
+; NOOPT-NEXT:    i8 3, label [[THEN_2:%.*]]
 ; NOOPT-NEXT:    ]
 ; NOOPT:       then.1:
 ; NOOPT-NEXT:    br label [[FINALLY:%.*]]

diff  --git a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
index 1e5ec361d154c..53acbb6a7bceb 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
@@ -8,30 +8,29 @@ define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr
 ; CHECK:       ; %bb.0: ; %bb
 ; CHECK-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; CHECK-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x8
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_bitcmp0_b32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB0_2
 ; CHECK-NEXT:  ; %bb.1: ; %bb10
-; CHECK-NEXT:    global_load_dwordx2 v[8:9], v0, s[8:9]
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    global_load_dwordx2 v[0:1], v0, s[8:9]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; CHECK-NEXT:    v_bfe_u32 v6, v8, 8, 8
-; CHECK-NEXT:    v_bfe_u32 v5, v8, 16, 8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 24, v8
-; CHECK-NEXT:    v_and_b32_e32 v3, 0xff, v9
-; CHECK-NEXT:    v_bfe_u32 v2, v9, 8, 8
-; CHECK-NEXT:    v_bfe_u32 v1, v9, 16, 8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v0, 24, v9
+; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 8, v0
+; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
+; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
+; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 24, v1
 ; CHECK-NEXT:    s_branch .LBB0_3
 ; CHECK-NEXT:  .LBB0_2:
-; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v4, 0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v5, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v6, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v7, 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:  .LBB0_3: ; %bb41
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x48
 ; CHECK-NEXT:    v_mov_b32_e32 v8, s10
@@ -48,16 +47,16 @@ define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr
 ; CHECK-NEXT:    v_mov_b32_e32 v19, s21
 ; CHECK-NEXT:    v_mov_b32_e32 v20, s22
 ; CHECK-NEXT:    v_mov_b32_e32 v21, s23
-; CHECK-NEXT:    flat_store_byte v[8:9], v7
-; CHECK-NEXT:    flat_store_byte v[10:11], v6
-; CHECK-NEXT:    flat_store_byte v[12:13], v5
-; CHECK-NEXT:    flat_store_byte v[14:15], v4
-; CHECK-NEXT:    flat_store_byte v[16:17], v3
-; CHECK-NEXT:    flat_store_byte v[18:19], v2
-; CHECK-NEXT:    flat_store_byte v[20:21], v1
+; CHECK-NEXT:    flat_store_byte v[8:9], v0
+; CHECK-NEXT:    flat_store_byte v[10:11], v7
+; CHECK-NEXT:    flat_store_byte v[12:13], v6
+; CHECK-NEXT:    flat_store_byte v[14:15], v5
+; CHECK-NEXT:    flat_store_byte v[16:17], v1
+; CHECK-NEXT:    flat_store_byte v[18:19], v4
+; CHECK-NEXT:    flat_store_byte v[20:21], v3
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
-; CHECK-NEXT:    flat_store_byte v[2:3], v0
+; CHECK-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; CHECK-NEXT:    flat_store_byte v[0:1], v2
 ; CHECK-NEXT:    s_endpgm
 bb:
   br i1 %arg, label %bb10, label %bb41

diff  --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
index efbbe2b27f10f..6dabd8c0b83ea 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -13,9 +13,9 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
+; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
+; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -30,25 +30,27 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
 ; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_or_b32_e32 v3, v6, v2
-; SI-NEXT:    v_or_b32_e32 v2, v4, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; SI-NEXT:    v_or_b32_e32 v2, v6, v2
+; SI-NEXT:    v_or_b32_e32 v3, v5, v3
 ; SI-NEXT:    s_mov_b64 vcc, exec
 ; SI-NEXT:    s_cbranch_execz .LBB0_3
 ; SI-NEXT:    s_branch .LBB0_4
 ; SI-NEXT:  .LBB0_2:
-; SI-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; implicit-def: $vgpr4
+; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    s_mov_b64 vcc, 0
 ; SI-NEXT:  .LBB0_3: ; %T
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc
+; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:2 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -61,29 +63,29 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
 ; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; SI-NEXT:    v_or_b32_e32 v3, v4, v0
-; SI-NEXT:    v_or_b32_e32 v2, v2, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
+; SI-NEXT:    v_or_b32_e32 v2, v2, v0
+; SI-NEXT:    v_or_b32_e32 v3, v3, v1
 ; SI-NEXT:  .LBB0_4: ; %exit
-; SI-NEXT:    v_ashrrev_i32_e32 v0, 16, v2
-; SI-NEXT:    v_bfe_i32 v1, v2, 0, 16
-; SI-NEXT:    v_bfe_i32 v2, v3, 0, 16
-; SI-NEXT:    v_mov_b32_e32 v3, 0xffff0000
-; SI-NEXT:    v_bfrev_b32_e32 v4, 1
-; SI-NEXT:    v_mov_b32_e32 v5, 0xffff
-; SI-NEXT:    v_mov_b32_e32 v6, 0x8000
+; SI-NEXT:    v_bfe_i32 v0, v3, 0, 16
+; SI-NEXT:    v_bfe_i32 v1, v4, 0, 16
+; SI-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; SI-NEXT:    v_mov_b32_e32 v3, 0xffff
+; SI-NEXT:    v_mov_b32_e32 v4, 0x8000
+; SI-NEXT:    v_mov_b32_e32 v5, 0xffff0000
+; SI-NEXT:    v_bfrev_b32_e32 v6, 1
 ; SI-NEXT:    v_mov_b32_e32 v7, 0xffff8000
 ; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
-; SI-NEXT:    v_cndmask_b32_e32 v4, v3, v4, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
-; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v6, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
 ; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
-; SI-NEXT:    v_cndmask_b32_e32 v1, -1, v7, vcc
-; SI-NEXT:    v_or_b32_e32 v0, v0, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v1
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_alignbit_b32 v1, v2, v4, 16
+; SI-NEXT:    v_cndmask_b32_e32 v2, -1, v7, vcc
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v2
+; SI-NEXT:    v_or_b32_e32 v2, v3, v4
+; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: vec_8xi16_extract_4xi16:
@@ -178,23 +180,26 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:8 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc
+; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:10 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; SI-NEXT:    v_or_b32_e32 v5, v6, v2
-; SI-NEXT:    v_or_b32_e32 v4, v4, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
+; SI-NEXT:    v_or_b32_e32 v3, v6, v3
+; SI-NEXT:    v_or_b32_e32 v5, v5, v7
 ; SI-NEXT:    s_mov_b64 vcc, exec
 ; SI-NEXT:    s_cbranch_execz .LBB1_3
 ; SI-NEXT:    s_branch .LBB1_4
 ; SI-NEXT:  .LBB1_2:
-; SI-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $vgpr4
+; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    s_mov_b64 vcc, 0
 ; SI-NEXT:  .LBB1_3: ; %T
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -209,39 +214,39 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:8 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:10 glc
+; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:10 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc
+; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:12 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
+; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:14 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; SI-NEXT:    v_or_b32_e32 v5, v4, v0
-; SI-NEXT:    v_or_b32_e32 v4, v2, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
+; SI-NEXT:    v_or_b32_e32 v3, v3, v0
+; SI-NEXT:    v_or_b32_e32 v5, v5, v1
 ; SI-NEXT:  .LBB1_4: ; %exit
-; SI-NEXT:    v_ashrrev_i32_e32 v2, 16, v4
-; SI-NEXT:    v_ashr_i64 v[0:1], v[4:5], 48
+; SI-NEXT:    v_bfe_i32 v0, v5, 0, 16
 ; SI-NEXT:    v_bfe_i32 v1, v4, 0, 16
-; SI-NEXT:    v_bfe_i32 v3, v5, 0, 16
-; SI-NEXT:    v_mov_b32_e32 v4, 0xffff0000
-; SI-NEXT:    v_bfrev_b32_e32 v5, 1
-; SI-NEXT:    v_mov_b32_e32 v6, 0xffff
-; SI-NEXT:    v_mov_b32_e32 v7, 0x8000
-; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
-; SI-NEXT:    v_cndmask_b32_e32 v8, v4, v5, vcc
+; SI-NEXT:    v_bfe_i32 v3, v3, 0, 16
+; SI-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; SI-NEXT:    v_mov_b32_e32 v4, 0xffff
+; SI-NEXT:    v_mov_b32_e32 v5, 0x8000
+; SI-NEXT:    v_mov_b32_e32 v6, 0xffff0000
+; SI-NEXT:    v_bfrev_b32_e32 v7, 1
+; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
+; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
 ; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
 ; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v3
-; SI-NEXT:    v_cndmask_b32_e32 v2, v6, v7, vcc
-; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
 ; SI-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
-; SI-NEXT:    v_or_b32_e32 v0, v1, v8
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    v_alignbit_b32 v1, v2, v8, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
+; SI-NEXT:    v_cndmask_b32_e32 v4, v6, v7, vcc
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    v_or_b32_e32 v2, v3, v4
+; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: vec_8xi16_extract_4xi16_2:
@@ -494,9 +499,9 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
+; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
+; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -527,25 +532,27 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
 ; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_or_b32_e32 v3, v6, v2
-; SI-NEXT:    v_or_b32_e32 v2, v4, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; SI-NEXT:    v_or_b32_e32 v2, v6, v2
+; SI-NEXT:    v_or_b32_e32 v3, v5, v3
 ; SI-NEXT:    s_mov_b64 vcc, exec
 ; SI-NEXT:    s_cbranch_execz .LBB3_3
 ; SI-NEXT:    s_branch .LBB3_4
 ; SI-NEXT:  .LBB3_2:
-; SI-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
+; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; implicit-def: $vgpr4
+; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    s_mov_b64 vcc, 0
 ; SI-NEXT:  .LBB3_3: ; %T
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc
+; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:2 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -574,29 +581,29 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
 ; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; SI-NEXT:    v_or_b32_e32 v3, v4, v0
-; SI-NEXT:    v_or_b32_e32 v2, v2, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
+; SI-NEXT:    v_or_b32_e32 v2, v2, v0
+; SI-NEXT:    v_or_b32_e32 v3, v3, v1
 ; SI-NEXT:  .LBB3_4: ; %exit
-; SI-NEXT:    v_ashrrev_i32_e32 v0, 16, v2
-; SI-NEXT:    v_bfe_i32 v1, v2, 0, 16
-; SI-NEXT:    v_bfe_i32 v2, v3, 0, 16
-; SI-NEXT:    v_mov_b32_e32 v3, 0xffff0000
-; SI-NEXT:    v_bfrev_b32_e32 v4, 1
-; SI-NEXT:    v_mov_b32_e32 v5, 0xffff
-; SI-NEXT:    v_mov_b32_e32 v6, 0x8000
+; SI-NEXT:    v_bfe_i32 v0, v3, 0, 16
+; SI-NEXT:    v_bfe_i32 v1, v4, 0, 16
+; SI-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; SI-NEXT:    v_mov_b32_e32 v3, 0xffff
+; SI-NEXT:    v_mov_b32_e32 v4, 0x8000
+; SI-NEXT:    v_mov_b32_e32 v5, 0xffff0000
+; SI-NEXT:    v_bfrev_b32_e32 v6, 1
 ; SI-NEXT:    v_mov_b32_e32 v7, 0xffff8000
 ; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
-; SI-NEXT:    v_cndmask_b32_e32 v4, v3, v4, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
-; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v6, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
 ; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
-; SI-NEXT:    v_cndmask_b32_e32 v1, -1, v7, vcc
-; SI-NEXT:    v_or_b32_e32 v0, v0, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v1
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_alignbit_b32 v1, v2, v4, 16
+; SI-NEXT:    v_cndmask_b32_e32 v2, -1, v7, vcc
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v2
+; SI-NEXT:    v_or_b32_e32 v2, v3, v4
+; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: vec_16xi16_extract_4xi16:
@@ -703,13 +710,13 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:8 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc
+; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:10 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc
+; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:12 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:14 glc
+; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:14 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -727,15 +734,18 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; SI-NEXT:    v_or_b32_e32 v5, v6, v2
-; SI-NEXT:    v_or_b32_e32 v4, v4, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; SI-NEXT:    v_or_b32_e32 v2, v7, v2
+; SI-NEXT:    v_or_b32_e32 v3, v6, v3
 ; SI-NEXT:    s_mov_b64 vcc, exec
 ; SI-NEXT:    s_cbranch_execz .LBB4_3
 ; SI-NEXT:    s_branch .LBB4_4
 ; SI-NEXT:  .LBB4_2:
-; SI-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
+; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; implicit-def: $vgpr4
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr5
 ; SI-NEXT:    s_mov_b64 vcc, 0
 ; SI-NEXT:  .LBB4_3: ; %T
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -750,11 +760,11 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:8 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:10 glc
+; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:10 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc
+; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:12 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:14 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -775,29 +785,29 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
 ; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; SI-NEXT:    v_or_b32_e32 v5, v4, v0
-; SI-NEXT:    v_or_b32_e32 v4, v2, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
+; SI-NEXT:    v_or_b32_e32 v2, v2, v0
+; SI-NEXT:    v_or_b32_e32 v3, v3, v1
 ; SI-NEXT:  .LBB4_4: ; %exit
-; SI-NEXT:    v_ashrrev_i32_e32 v2, 16, v4
-; SI-NEXT:    v_ashr_i64 v[0:1], v[4:5], 48
+; SI-NEXT:    v_bfe_i32 v0, v3, 0, 16
 ; SI-NEXT:    v_bfe_i32 v1, v4, 0, 16
+; SI-NEXT:    v_bfe_i32 v2, v2, 0, 16
 ; SI-NEXT:    v_bfe_i32 v3, v5, 0, 16
-; SI-NEXT:    v_mov_b32_e32 v4, 0xffff0000
-; SI-NEXT:    v_bfrev_b32_e32 v5, 1
-; SI-NEXT:    v_mov_b32_e32 v6, 0xffff
-; SI-NEXT:    v_mov_b32_e32 v7, 0x8000
-; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
-; SI-NEXT:    v_cndmask_b32_e32 v8, v4, v5, vcc
+; SI-NEXT:    v_mov_b32_e32 v4, 0xffff
+; SI-NEXT:    v_mov_b32_e32 v5, 0x8000
+; SI-NEXT:    v_mov_b32_e32 v6, 0xffff0000
+; SI-NEXT:    v_bfrev_b32_e32 v7, 1
+; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
+; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
 ; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
+; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
+; SI-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
 ; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v3
-; SI-NEXT:    v_cndmask_b32_e32 v2, v6, v7, vcc
-; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
-; SI-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
-; SI-NEXT:    v_or_b32_e32 v0, v1, v8
+; SI-NEXT:    v_cndmask_b32_e32 v3, v6, v7, vcc
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    v_alignbit_b32 v1, v2, v8, 16
+; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1195,21 +1205,21 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr
 ; SI-NEXT:    s_mov_b32 s39, 0xf000
 ; SI-NEXT:    s_mov_b32 s36, s38
 ; SI-NEXT:    s_mov_b32 s37, s38
-; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[36:39], 0 addr64 glc
+; SI-NEXT:    buffer_load_ushort v9, v[2:3], s[36:39], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[36:39], 0 addr64 offset:2 glc
+; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[36:39], 0 addr64 offset:2 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[36:39], 0 addr64 offset:4 glc
+; SI-NEXT:    buffer_load_ushort v10, v[2:3], s[36:39], 0 addr64 offset:4 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[36:39], 0 addr64 offset:6 glc
+; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[36:39], 0 addr64 offset:6 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[36:39], 0 addr64 offset:8 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v9, v[2:3], s[36:39], 0 addr64 offset:10 glc
+; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[36:39], 0 addr64 offset:10 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v10, v[2:3], s[36:39], 0 addr64 offset:12 glc
+; SI-NEXT:    buffer_load_ushort v11, v[2:3], s[36:39], 0 addr64 offset:12 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v11, v[2:3], s[36:39], 0 addr64 offset:14 glc
+; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[36:39], 0 addr64 offset:14 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -1227,39 +1237,46 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[36:39], 0 addr64 offset:30 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v4
-; SI-NEXT:    v_or_b32_e32 v5, v10, v2
-; SI-NEXT:    v_or_b32_e32 v4, v8, v3
-; SI-NEXT:    v_or_b32_e32 v3, v7, v9
-; SI-NEXT:    v_or_b32_e32 v2, v6, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; SI-NEXT:    v_or_b32_e32 v3, v11, v2
+; SI-NEXT:    v_or_b32_e32 v8, v8, v12
+; SI-NEXT:    v_or_b32_e32 v2, v10, v13
+; SI-NEXT:    v_or_b32_e32 v9, v9, v14
 ; SI-NEXT:    s_mov_b64 vcc, exec
 ; SI-NEXT:    s_cbranch_execz .LBB7_3
 ; SI-NEXT:    s_branch .LBB7_4
 ; SI-NEXT:  .LBB7_2:
-; SI-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
+; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr4
+; SI-NEXT:    ; implicit-def: $vgpr8
+; SI-NEXT:    ; implicit-def: $vgpr6
+; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; implicit-def: $vgpr7
 ; SI-NEXT:    s_mov_b64 vcc, 0
 ; SI-NEXT:  .LBB7_3: ; %T
 ; SI-NEXT:    s_mov_b32 s39, 0xf000
 ; SI-NEXT:    s_mov_b32 s36, s38
 ; SI-NEXT:    s_mov_b32 s37, s38
-; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[36:39], 0 addr64 glc
+; SI-NEXT:    buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[36:39], 0 addr64 offset:2 glc
+; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[36:39], 0 addr64 offset:2 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[36:39], 0 addr64 offset:4 glc
+; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[36:39], 0 addr64 offset:4 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[36:39], 0 addr64 offset:6 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v7, v[0:1], s[36:39], 0 addr64 offset:8 glc
+; SI-NEXT:    buffer_load_ushort v8, v[0:1], s[36:39], 0 addr64 offset:8 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[36:39], 0 addr64 offset:10 glc
+; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[36:39], 0 addr64 offset:10 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v8, v[0:1], s[36:39], 0 addr64 offset:12 glc
+; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[36:39], 0 addr64 offset:12 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:14 glc
+; SI-NEXT:    buffer_load_ushort v7, v[0:1], s[36:39], 0 addr64 offset:14 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -1277,52 +1294,52 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[36:39], 0 addr64 offset:30 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
-; SI-NEXT:    v_or_b32_e32 v5, v8, v0
-; SI-NEXT:    v_or_b32_e32 v4, v7, v1
-; SI-NEXT:    v_or_b32_e32 v3, v6, v9
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
+; SI-NEXT:    v_or_b32_e32 v3, v3, v0
+; SI-NEXT:    v_or_b32_e32 v8, v8, v1
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v10
+; SI-NEXT:    v_or_b32_e32 v9, v9, v11
 ; SI-NEXT:  .LBB7_4: ; %exit
-; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v9
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v5
+; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v8
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; SI-NEXT:    s_movk_i32 s34, 0x3800
-; SI-NEXT:    v_mov_b32_e32 v8, 0x3d000000
-; SI-NEXT:    v_mov_b32_e32 v9, 0x39000000
-; SI-NEXT:    v_mov_b32_e32 v10, 0x3d00
-; SI-NEXT:    v_mov_b32_e32 v11, 0x3900
+; SI-NEXT:    v_mov_b32_e32 v8, 0x3d00
+; SI-NEXT:    v_mov_b32_e32 v9, 0x3900
+; SI-NEXT:    v_mov_b32_e32 v10, 0x3d000000
+; SI-NEXT:    v_mov_b32_e32 v11, 0x39000000
 ; SI-NEXT:    v_cmp_lt_u32_e32 vcc, s34, v0
-; SI-NEXT:    v_cndmask_b32_e32 v12, v8, v9, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, v8, v9, vcc
 ; SI-NEXT:    v_cmp_lt_u32_e32 vcc, s34, v1
-; SI-NEXT:    v_cndmask_b32_e32 v0, v10, v11, vcc
-; SI-NEXT:    v_cmp_lt_u32_e32 vcc, s34, v2
-; SI-NEXT:    v_cndmask_b32_e32 v13, v8, v9, vcc
-; SI-NEXT:    v_cmp_lt_u32_e32 vcc, s34, v4
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v10, v11, vcc
-; SI-NEXT:    v_cmp_lt_u32_e32 vcc, s34, v6
-; SI-NEXT:    v_cndmask_b32_e32 v14, v8, v9, vcc
 ; SI-NEXT:    v_cmp_lt_u32_e32 vcc, s34, v5
-; SI-NEXT:    v_cndmask_b32_e32 v2, v10, v11, vcc
-; SI-NEXT:    v_cmp_lt_u32_e32 vcc, s34, v7
 ; SI-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
+; SI-NEXT:    v_cmp_lt_u32_e32 vcc, s34, v6
+; SI-NEXT:    v_cndmask_b32_e32 v12, v10, v11, vcc
 ; SI-NEXT:    v_cmp_lt_u32_e32 vcc, s34, v3
-; SI-NEXT:    v_cndmask_b32_e32 v3, v10, v11, vcc
-; SI-NEXT:    v_or_b32_e32 v0, v0, v12
-; SI-NEXT:    v_or_b32_e32 v4, v1, v13
-; SI-NEXT:    v_or_b32_e32 v6, v2, v14
-; SI-NEXT:    v_or_b32_e32 v2, v3, v5
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
-; SI-NEXT:    v_alignbit_b32 v1, v2, v12, 16
-; SI-NEXT:    v_alignbit_b32 v5, v6, v13, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v14
+; SI-NEXT:    v_cndmask_b32_e32 v3, v8, v9, vcc
+; SI-NEXT:    v_cmp_lt_u32_e32 vcc, s34, v7
+; SI-NEXT:    v_cndmask_b32_e32 v7, v10, v11, vcc
+; SI-NEXT:    v_cmp_lt_u32_e32 vcc, s34, v2
+; SI-NEXT:    v_cndmask_b32_e32 v2, v8, v9, vcc
+; SI-NEXT:    v_cmp_lt_u32_e32 vcc, s34, v4
+; SI-NEXT:    v_cndmask_b32_e32 v8, v10, v11, vcc
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    v_or_b32_e32 v4, v5, v12
+; SI-NEXT:    v_or_b32_e32 v6, v3, v7
+; SI-NEXT:    v_or_b32_e32 v2, v2, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v8
+; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT:    v_alignbit_b32 v5, v6, v12, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: vec_16xi16_extract_8xi16_0:

diff  --git a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll
index 36a93bd2511ce..15abf44f3a0ea 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll
@@ -1,82 +1,26 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
 
+; GCN-LABEL: extract_2xi16
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: v_bfe_i32
+; GCN: v_bfe_i32
+
 define <2 x i16> @extract_2xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
-; GCN-LABEL: extract_2xi16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v4, 1, v4
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
-; GCN-NEXT:    s_xor_b64 s[4:5], vcc, -1
-; GCN-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
-; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
-; GCN-NEXT:    s_cbranch_execz .LBB0_2
-; GCN-NEXT:  ; %bb.1: ; %F
-; GCN-NEXT:    s_mov_b32 s10, 0
-; GCN-NEXT:    s_mov_b32 s11, 0xf000
-; GCN-NEXT:    s_mov_b32 s8, s10
-; GCN-NEXT:    s_mov_b32 s9, s10
-; GCN-NEXT:    buffer_load_ushort v0, v[2:3], s[8:11], 0 addr64 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_ushort v1, v[2:3], s[8:11], 0 addr64 offset:2 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:4 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:6 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:8 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:10 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:12 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_ushort v2, v[2:3], s[8:11], 0 addr64 offset:14 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GCN-NEXT:    v_or_b32_e32 v4, v0, v1
-; GCN-NEXT:    ; implicit-def: $vgpr0
-; GCN-NEXT:  .LBB0_2: ; %Flow
-; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB0_4
-; GCN-NEXT:  ; %bb.3: ; %T
-; GCN-NEXT:    s_mov_b32 s10, 0
-; GCN-NEXT:    s_mov_b32 s11, 0xf000
-; GCN-NEXT:    s_mov_b32 s8, s10
-; GCN-NEXT:    s_mov_b32 s9, s10
-; GCN-NEXT:    buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 offset:2 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:4 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:6 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:8 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:10 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:12 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 offset:14 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
-; GCN-NEXT:    v_or_b32_e32 v4, v2, v0
-; GCN-NEXT:  .LBB0_4: ; %exit
-; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT:    v_ashrrev_i32_e32 v0, 16, v4
-; GCN-NEXT:    v_bfe_i32 v1, v4, 0, 16
-; GCN-NEXT:    v_mov_b32_e32 v2, 0xffff
-; GCN-NEXT:    v_mov_b32_e32 v3, 0x8000
-; GCN-NEXT:    v_mov_b32_e32 v4, 0xffff8000
-; GCN-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
-; GCN-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v2, -1, v4, vcc
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; GCN-NEXT:    s_setpc_b64 s[30:31]
   br i1 %c0, label %T, label %F
 
 T:
@@ -95,59 +39,9 @@ exit:
   ret <2 x i16> %r2
 }
 
+; GCN-LABEL: extract_2xi64
+; GCN-COUNT-2: v_cndmask_b32
 define <2 x i64> @extract_2xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
-; GCN-LABEL: extract_2xi64:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v4, 1, v4
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
-; GCN-NEXT:    s_xor_b64 s[4:5], vcc, -1
-; GCN-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19
-; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
-; GCN-NEXT:    s_cbranch_execz .LBB1_2
-; GCN-NEXT:  ; %bb.1: ; %F
-; GCN-NEXT:    s_mov_b32 s10, 0
-; GCN-NEXT:    s_mov_b32 s11, 0xf000
-; GCN-NEXT:    s_mov_b32 s8, s10
-; GCN-NEXT:    s_mov_b32 s9, s10
-; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:32 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v[2:3], s[8:11], 0 addr64 offset:48 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    ; implicit-def: $vgpr0
-; GCN-NEXT:  .LBB1_2: ; %Flow
-; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB1_4
-; GCN-NEXT:  ; %bb.3: ; %T
-; GCN-NEXT:    s_mov_b32 s10, 0
-; GCN-NEXT:    s_mov_b32 s11, 0xf000
-; GCN-NEXT:    s_mov_b32 s8, s10
-; GCN-NEXT:    s_mov_b32 s9, s10
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:32 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 offset:48 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:  .LBB1_4: ; %exit
-; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v1, 0xffff8000
-; GCN-NEXT:    v_cmp_lt_i64_e32 vcc, -1, v[4:5]
-; GCN-NEXT:    v_cndmask_b32_e32 v0, -1, v1, vcc
-; GCN-NEXT:    v_cmp_lt_i64_e32 vcc, -1, v[6:7]
-; GCN-NEXT:    v_cndmask_b32_e32 v2, -1, v1, vcc
-; GCN-NEXT:    v_mov_b32_e32 v1, -1
-; GCN-NEXT:    v_mov_b32_e32 v3, -1
-; GCN-NEXT:    s_setpc_b64 s[30:31]
   br i1 %c0, label %T, label %F
 
 T:
@@ -166,65 +60,9 @@ exit:
   ret <2 x i64> %r2
 }
 
+; GCN-LABEL: extract_4xi64
+; GCN-COUNT-4: v_cndmask_b32
 define <4 x i64> @extract_4xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
-; GCN-LABEL: extract_4xi64:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v4, 1, v4
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
-; GCN-NEXT:    s_xor_b64 s[4:5], vcc, -1
-; GCN-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19
-; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
-; GCN-NEXT:    s_cbranch_execz .LBB2_2
-; GCN-NEXT:  ; %bb.1: ; %F
-; GCN-NEXT:    s_mov_b32 s10, 0
-; GCN-NEXT:    s_mov_b32 s11, 0xf000
-; GCN-NEXT:    s_mov_b32 s8, s10
-; GCN-NEXT:    s_mov_b32 s9, s10
-; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[2:3], s[8:11], 0 addr64 offset:32 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v[2:3], s[8:11], 0 addr64 offset:48 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    ; implicit-def: $vgpr0
-; GCN-NEXT:  .LBB2_2: ; %Flow
-; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB2_4
-; GCN-NEXT:  ; %bb.3: ; %T
-; GCN-NEXT:    s_mov_b32 s10, 0
-; GCN-NEXT:    s_mov_b32 s11, 0xf000
-; GCN-NEXT:    s_mov_b32 s8, s10
-; GCN-NEXT:    s_mov_b32 s9, s10
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:32 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 offset:48 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:  .LBB2_4: ; %exit
-; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v1, 0xffff8000
-; GCN-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc
-; GCN-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[6:7]
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v1, -1, vcc
-; GCN-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[8:9]
-; GCN-NEXT:    v_cndmask_b32_e64 v4, v1, -1, vcc
-; GCN-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[10:11]
-; GCN-NEXT:    v_cndmask_b32_e64 v6, v1, -1, vcc
-; GCN-NEXT:    v_mov_b32_e32 v1, -1
-; GCN-NEXT:    v_mov_b32_e32 v3, -1
-; GCN-NEXT:    v_mov_b32_e32 v5, -1
-; GCN-NEXT:    v_mov_b32_e32 v7, -1
-; GCN-NEXT:    s_setpc_b64 s[30:31]
   br i1 %c0, label %T, label %F
 
 T:
@@ -243,92 +81,9 @@ exit:
   ret <4 x i64> %r2
 }
 
+; GCN-LABEL: extract_8xi64
+; GCN-COUNT-8: v_cndmask_b32
 define <8 x i64> @extract_8xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
-; GCN-LABEL: extract_8xi64:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v4, 1, v4
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
-; GCN-NEXT:    s_xor_b64 s[4:5], vcc, -1
-; GCN-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35
-; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
-; GCN-NEXT:    s_cbranch_execz .LBB3_2
-; GCN-NEXT:  ; %bb.1: ; %F
-; GCN-NEXT:    s_mov_b32 s10, 0
-; GCN-NEXT:    s_mov_b32 s11, 0xf000
-; GCN-NEXT:    s_mov_b32 s8, s10
-; GCN-NEXT:    s_mov_b32 s9, s10
-; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:112 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:96 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:80 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:64 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[2:3], s[8:11], 0 addr64 offset:32 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[16:19], v[2:3], s[8:11], 0 addr64 offset:48 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    ; implicit-def: $vgpr0
-; GCN-NEXT:  .LBB3_2: ; %Flow
-; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB3_4
-; GCN-NEXT:  ; %bb.3: ; %T
-; GCN-NEXT:    s_mov_b32 s10, 0
-; GCN-NEXT:    s_mov_b32 s11, 0xf000
-; GCN-NEXT:    s_mov_b32 s8, s10
-; GCN-NEXT:    s_mov_b32 s9, s10
-; GCN-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:112 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:96 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:80 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:64 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:32 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[8:11], 0 addr64 offset:48 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:  .LBB3_4: ; %exit
-; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT:    v_mov_b32_e32 v1, 0xffff8000
-; GCN-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[6:7]
-; GCN-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[8:9]
-; GCN-NEXT:    v_cmp_gt_i64_e64 s[6:7], 0, v[10:11]
-; GCN-NEXT:    v_cmp_gt_i64_e64 s[8:9], 0, v[12:13]
-; GCN-NEXT:    v_cmp_gt_i64_e64 s[10:11], 0, v[14:15]
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cmp_gt_i64_e64 s[12:13], 0, v[16:17]
-; GCN-NEXT:    v_cmp_gt_i64_e64 s[14:15], 0, v[18:19]
-; GCN-NEXT:    v_cmp_gt_i64_e64 s[16:17], 0, v[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, -1, s[16:17]
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v1, -1, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v4, v1, -1, s[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v6, v1, -1, s[6:7]
-; GCN-NEXT:    v_cndmask_b32_e64 v8, v1, -1, s[8:9]
-; GCN-NEXT:    v_cndmask_b32_e64 v10, v1, -1, s[10:11]
-; GCN-NEXT:    v_cndmask_b32_e64 v12, v1, -1, s[12:13]
-; GCN-NEXT:    v_cndmask_b32_e64 v14, v1, -1, s[14:15]
-; GCN-NEXT:    v_mov_b32_e32 v1, -1
-; GCN-NEXT:    v_mov_b32_e32 v3, -1
-; GCN-NEXT:    v_mov_b32_e32 v5, -1
-; GCN-NEXT:    v_mov_b32_e32 v7, -1
-; GCN-NEXT:    v_mov_b32_e32 v9, -1
-; GCN-NEXT:    v_mov_b32_e32 v11, -1
-; GCN-NEXT:    v_mov_b32_e32 v13, -1
-; GCN-NEXT:    v_mov_b32_e32 v15, -1
-; GCN-NEXT:    s_setpc_b64 s[30:31]
   br i1 %c0, label %T, label %F
 
 T:
@@ -347,59 +102,9 @@ exit:
   ret <8 x i64> %r2
 }
 
+; GCN-LABEL: extract_2xf64
+; GCN-COUNT-2: v_cndmask_b32
 define <2 x double> @extract_2xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
-; GCN-LABEL: extract_2xf64:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v4, 1, v4
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
-; GCN-NEXT:    s_xor_b64 s[4:5], vcc, -1
-; GCN-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19
-; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
-; GCN-NEXT:    s_cbranch_execz .LBB4_2
-; GCN-NEXT:  ; %bb.1: ; %F
-; GCN-NEXT:    s_mov_b32 s10, 0
-; GCN-NEXT:    s_mov_b32 s11, 0xf000
-; GCN-NEXT:    s_mov_b32 s8, s10
-; GCN-NEXT:    s_mov_b32 s9, s10
-; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:32 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v[2:3], s[8:11], 0 addr64 offset:48 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    ; implicit-def: $vgpr0
-; GCN-NEXT:  .LBB4_2: ; %Flow
-; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB4_4
-; GCN-NEXT:  ; %bb.3: ; %T
-; GCN-NEXT:    s_mov_b32 s10, 0
-; GCN-NEXT:    s_mov_b32 s11, 0xf000
-; GCN-NEXT:    s_mov_b32 s8, s10
-; GCN-NEXT:    s_mov_b32 s9, s10
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:32 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 offset:48 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:  .LBB4_4: ; %exit
-; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, 0xbff00000
-; GCN-NEXT:    v_cmp_lt_f64_e32 vcc, -1.0, v[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, -2.0, vcc
-; GCN-NEXT:    v_cmp_lt_f64_e32 vcc, -1.0, v[6:7]
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v0, -2.0, vcc
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    v_mov_b32_e32 v2, 0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
   br i1 %c0, label %T, label %F
 
 T:
@@ -418,65 +123,9 @@ exit:
   ret <2 x double> %r2
 }
 
+; GCN-LABEL: extract_4xf64
+; GCN-COUNT-4: v_cndmask_b32
 define <4 x double> @extract_4xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
-; GCN-LABEL: extract_4xf64:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v4, 1, v4
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
-; GCN-NEXT:    s_xor_b64 s[4:5], vcc, -1
-; GCN-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19
-; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
-; GCN-NEXT:    s_cbranch_execz .LBB5_2
-; GCN-NEXT:  ; %bb.1: ; %F
-; GCN-NEXT:    s_mov_b32 s10, 0
-; GCN-NEXT:    s_mov_b32 s11, 0xf000
-; GCN-NEXT:    s_mov_b32 s8, s10
-; GCN-NEXT:    s_mov_b32 s9, s10
-; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[2:3], s[8:11], 0 addr64 offset:32 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v[2:3], s[8:11], 0 addr64 offset:48 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    ; implicit-def: $vgpr0
-; GCN-NEXT:  .LBB5_2: ; %Flow
-; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB5_4
-; GCN-NEXT:  ; %bb.3: ; %T
-; GCN-NEXT:    s_mov_b32 s10, 0
-; GCN-NEXT:    s_mov_b32 s11, 0xf000
-; GCN-NEXT:    s_mov_b32 s8, s10
-; GCN-NEXT:    s_mov_b32 s9, s10
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:32 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 offset:48 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:  .LBB5_4: ; %exit
-; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, 0xbff00000
-; GCN-NEXT:    v_cmp_nlt_f64_e32 vcc, -1.0, v[4:5]
-; GCN-NEXT:    v_cndmask_b32_e32 v1, -2.0, v0, vcc
-; GCN-NEXT:    v_cmp_nlt_f64_e32 vcc, -1.0, v[6:7]
-; GCN-NEXT:    v_cndmask_b32_e32 v3, -2.0, v0, vcc
-; GCN-NEXT:    v_cmp_nlt_f64_e32 vcc, -1.0, v[8:9]
-; GCN-NEXT:    v_cndmask_b32_e32 v5, -2.0, v0, vcc
-; GCN-NEXT:    v_cmp_nlt_f64_e32 vcc, -1.0, v[10:11]
-; GCN-NEXT:    v_cndmask_b32_e32 v7, -2.0, v0, vcc
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    v_mov_b32_e32 v2, 0
-; GCN-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-NEXT:    v_mov_b32_e32 v6, 0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
   br i1 %c0, label %T, label %F
 
 T:
@@ -495,92 +144,9 @@ exit:
   ret <4 x double> %r2
 }
 
+; GCN-LABEL: extract_8xf64
+; GCN-COUNT-8: v_cndmask_b32
 define <8 x double> @extract_8xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
-; GCN-LABEL: extract_8xf64:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v4, 1, v4
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
-; GCN-NEXT:    s_xor_b64 s[4:5], vcc, -1
-; GCN-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35
-; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
-; GCN-NEXT:    s_cbranch_execz .LBB6_2
-; GCN-NEXT:  ; %bb.1: ; %F
-; GCN-NEXT:    s_mov_b32 s10, 0
-; GCN-NEXT:    s_mov_b32 s11, 0xf000
-; GCN-NEXT:    s_mov_b32 s8, s10
-; GCN-NEXT:    s_mov_b32 s9, s10
-; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:112 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:96 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:80 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:64 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[2:3], s[8:11], 0 addr64 offset:32 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[16:19], v[2:3], s[8:11], 0 addr64 offset:48 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    ; implicit-def: $vgpr0
-; GCN-NEXT:  .LBB6_2: ; %Flow
-; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB6_4
-; GCN-NEXT:  ; %bb.3: ; %T
-; GCN-NEXT:    s_mov_b32 s10, 0
-; GCN-NEXT:    s_mov_b32 s11, 0xf000
-; GCN-NEXT:    s_mov_b32 s8, s10
-; GCN-NEXT:    s_mov_b32 s9, s10
-; GCN-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:112 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:96 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:80 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:64 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:32 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[8:11], 0 addr64 offset:48 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:  .LBB6_4: ; %exit
-; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT:    v_mov_b32_e32 v0, 0xbff00000
-; GCN-NEXT:    v_cmp_nlt_f64_e32 vcc, -1.0, v[6:7]
-; GCN-NEXT:    v_cmp_nlt_f64_e64 s[4:5], -1.0, v[8:9]
-; GCN-NEXT:    v_cmp_nlt_f64_e64 s[6:7], -1.0, v[10:11]
-; GCN-NEXT:    v_cmp_nlt_f64_e64 s[8:9], -1.0, v[12:13]
-; GCN-NEXT:    v_cmp_nlt_f64_e64 s[10:11], -1.0, v[14:15]
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cmp_nlt_f64_e64 s[12:13], -1.0, v[16:17]
-; GCN-NEXT:    v_cmp_nlt_f64_e64 s[14:15], -1.0, v[18:19]
-; GCN-NEXT:    v_cmp_nlt_f64_e64 s[16:17], -1.0, v[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v1, -2.0, v0, s[16:17]
-; GCN-NEXT:    v_cndmask_b32_e32 v3, -2.0, v0, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v5, -2.0, v0, s[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v7, -2.0, v0, s[6:7]
-; GCN-NEXT:    v_cndmask_b32_e64 v9, -2.0, v0, s[8:9]
-; GCN-NEXT:    v_cndmask_b32_e64 v11, -2.0, v0, s[10:11]
-; GCN-NEXT:    v_cndmask_b32_e64 v13, -2.0, v0, s[12:13]
-; GCN-NEXT:    v_cndmask_b32_e64 v15, -2.0, v0, s[14:15]
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    v_mov_b32_e32 v2, 0
-; GCN-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-NEXT:    v_mov_b32_e32 v6, 0
-; GCN-NEXT:    v_mov_b32_e32 v8, 0
-; GCN-NEXT:    v_mov_b32_e32 v10, 0
-; GCN-NEXT:    v_mov_b32_e32 v12, 0
-; GCN-NEXT:    v_mov_b32_e32 v14, 0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
   br i1 %c0, label %T, label %F
 
 T:

diff  --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 952e89edeb799..08cf83fd2bd0f 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -255,13 +255,13 @@
 ; GCN-O1-NEXT:        Function Alias Analysis Results
 ; GCN-O1-NEXT:        Flatten the CFG
 ; GCN-O1-NEXT:        Dominator Tree Construction
+; GCN-O1-NEXT:        Cycle Info Analysis
+; GCN-O1-NEXT:        Uniformity Analysis
+; GCN-O1-NEXT:        AMDGPU IR late optimizations
 ; GCN-O1-NEXT:        Basic Alias Analysis (stateless AA impl)
 ; GCN-O1-NEXT:        Function Alias Analysis Results
 ; GCN-O1-NEXT:        Natural Loop Information
 ; GCN-O1-NEXT:        Code sinking
-; GCN-O1-NEXT:        Cycle Info Analysis
-; GCN-O1-NEXT:        Uniformity Analysis
-; GCN-O1-NEXT:        AMDGPU IR late optimizations
 ; GCN-O1-NEXT:        Post-Dominator Tree Construction
 ; GCN-O1-NEXT:        Unify divergent function exit nodes
 ; GCN-O1-NEXT:        Dominator Tree Construction
@@ -552,13 +552,13 @@
 ; GCN-O1-OPTS-NEXT:        Function Alias Analysis Results
 ; GCN-O1-OPTS-NEXT:        Flatten the CFG
 ; GCN-O1-OPTS-NEXT:        Dominator Tree Construction
+; GCN-O1-OPTS-NEXT:        Cycle Info Analysis
+; GCN-O1-OPTS-NEXT:        Uniformity Analysis
+; GCN-O1-OPTS-NEXT:        AMDGPU IR late optimizations
 ; GCN-O1-OPTS-NEXT:        Basic Alias Analysis (stateless AA impl)
 ; GCN-O1-OPTS-NEXT:        Function Alias Analysis Results
 ; GCN-O1-OPTS-NEXT:        Natural Loop Information
 ; GCN-O1-OPTS-NEXT:        Code sinking
-; GCN-O1-OPTS-NEXT:        Cycle Info Analysis
-; GCN-O1-OPTS-NEXT:        Uniformity Analysis
-; GCN-O1-OPTS-NEXT:        AMDGPU IR late optimizations
 ; GCN-O1-OPTS-NEXT:        Post-Dominator Tree Construction
 ; GCN-O1-OPTS-NEXT:        Unify divergent function exit nodes
 ; GCN-O1-OPTS-NEXT:        Dominator Tree Construction
@@ -861,13 +861,13 @@
 ; GCN-O2-NEXT:        Function Alias Analysis Results
 ; GCN-O2-NEXT:        Flatten the CFG
 ; GCN-O2-NEXT:        Dominator Tree Construction
+; GCN-O2-NEXT:        Cycle Info Analysis
+; GCN-O2-NEXT:        Uniformity Analysis
+; GCN-O2-NEXT:        AMDGPU IR late optimizations
 ; GCN-O2-NEXT:        Basic Alias Analysis (stateless AA impl)
 ; GCN-O2-NEXT:        Function Alias Analysis Results
 ; GCN-O2-NEXT:        Natural Loop Information
 ; GCN-O2-NEXT:        Code sinking
-; GCN-O2-NEXT:        Cycle Info Analysis
-; GCN-O2-NEXT:        Uniformity Analysis
-; GCN-O2-NEXT:        AMDGPU IR late optimizations
 ; GCN-O2-NEXT:        Post-Dominator Tree Construction
 ; GCN-O2-NEXT:        Unify divergent function exit nodes
 ; GCN-O2-NEXT:        Dominator Tree Construction
@@ -1184,13 +1184,13 @@
 ; GCN-O3-NEXT:        Function Alias Analysis Results
 ; GCN-O3-NEXT:        Flatten the CFG
 ; GCN-O3-NEXT:        Dominator Tree Construction
+; GCN-O3-NEXT:        Cycle Info Analysis
+; GCN-O3-NEXT:        Uniformity Analysis
+; GCN-O3-NEXT:        AMDGPU IR late optimizations
 ; GCN-O3-NEXT:        Basic Alias Analysis (stateless AA impl)
 ; GCN-O3-NEXT:        Function Alias Analysis Results
 ; GCN-O3-NEXT:        Natural Loop Information
 ; GCN-O3-NEXT:        Code sinking
-; GCN-O3-NEXT:        Cycle Info Analysis
-; GCN-O3-NEXT:        Uniformity Analysis
-; GCN-O3-NEXT:        AMDGPU IR late optimizations
 ; GCN-O3-NEXT:        Post-Dominator Tree Construction
 ; GCN-O3-NEXT:        Unify divergent function exit nodes
 ; GCN-O3-NEXT:        Dominator Tree Construction

diff  --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
index 911bb44078d51..0f2eedb1923d6 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -2101,7 +2101,10 @@ define void @crash_lshlrevb16_not_reg_op() {
 ; NOSDWA:       ; %bb.0: ; %bb0
 ; NOSDWA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; NOSDWA-NEXT:    s_mov_b64 s[4:5], 0
-; NOSDWA-NEXT:    v_mov_b32_e32 v0, 0x100
+; NOSDWA-NEXT:    v_mov_b32_e32 v0, 0xff
+; NOSDWA-NEXT:    v_and_b32_e32 v0, s4, v0
+; NOSDWA-NEXT:    v_lshlrev_b16_e64 v1, 8, 1
+; NOSDWA-NEXT:    v_or_b32_e32 v0, v0, v1
 ; NOSDWA-NEXT:    s_and_b64 vcc, exec, -1
 ; NOSDWA-NEXT:  .LBB22_1: ; %bb1
 ; NOSDWA-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -2121,7 +2124,9 @@ define void @crash_lshlrevb16_not_reg_op() {
 ; GFX89:       ; %bb.0: ; %bb0
 ; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_mov_b64 s[4:5], 0
-; GFX89-NEXT:    v_mov_b32_e32 v0, 0x100
+; GFX89-NEXT:    v_lshlrev_b16_e64 v0, 8, 1
+; GFX89-NEXT:    v_mov_b32_e32 v1, s4
+; GFX89-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX89-NEXT:    s_and_b64 vcc, exec, -1
 ; GFX89-NEXT:  .LBB22_1: ; %bb1
 ; GFX89-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -2141,7 +2146,8 @@ define void @crash_lshlrevb16_not_reg_op() {
 ; GFX9:       ; %bb.0: ; %bb0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0x100
+; GFX9-NEXT:    v_lshlrev_b16_e64 v0, 8, 1
+; GFX9-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_and_b64 vcc, exec, -1
 ; GFX9-NEXT:  .LBB22_1: ; %bb1
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -2160,16 +2166,18 @@ define void @crash_lshlrevb16_not_reg_op() {
 ; GFX10-LABEL: crash_lshlrevb16_not_reg_op:
 ; GFX10:       ; %bb.0: ; %bb0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b64 s[4:5], 0
+; GFX10-NEXT:    v_lshlrev_b16 v0, 8, 1
 ; GFX10-NEXT:    s_mov_b32 vcc_lo, exec_lo
+; GFX10-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX10-NEXT:  .LBB22_1: ; %bb1
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_lshl_b32 s6, s4, 3
-; GFX10-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-NEXT:    v_lshrrev_b16 v2, s6, 0x100
+; GFX10-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-NEXT:    v_lshrrev_b16 v3, s6, v0
 ; GFX10-NEXT:    s_mov_b64 s[4:5], 1
-; GFX10-NEXT:    flat_store_byte v[0:1], v2
+; GFX10-NEXT:    flat_store_byte v[1:2], v3
 ; GFX10-NEXT:    s_cbranch_vccnz .LBB22_1
 ; GFX10-NEXT:  ; %bb.2: ; %DummyReturnBlock
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)

diff  --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index 2355fa7870ea8..f78b408d78255 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
 
 define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
@@ -6,31 +6,27 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX906-NEXT:    v_mov_b32_e32 v3, 8
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 2, v0
 ; GFX906-NEXT:    v_mov_b32_e32 v1, 0
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dword v4, v2, s[4:5]
-; GFX906-NEXT:    s_mov_b32 s4, 0xff0000
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    global_load_dword v2, v5, s[4:5]
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_sdwa v5, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX906-NEXT:    v_and_or_b32 v4, v4, s4, v5
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 8, v2
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB0_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dword v0, v2, s[6:7]
+; GFX906-NEXT:    global_load_dword v2, v5, s[6:7]
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_sdwa v2, v3, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX906-NEXT:    v_and_or_b32 v4, v0, s4, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 8, v2
 ; GFX906-NEXT:  .LBB0_2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    global_store_byte_d16_hi v1, v4, s[2:3] offset:2
-; GFX906-NEXT:    global_store_short v1, v4, s[2:3]
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v4
+; GFX906-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_byte v1, v3, s[2:3] offset:2
+; GFX906-NEXT:    global_store_short v1, v0, s[2:3]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -54,19 +50,31 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 2, v0
 ; GFX906-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dword v2, v3, s[4:5]
+; GFX906-NEXT:    global_load_dword v2, v6, s[4:5]
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dword v2, v3, s[6:7]
+; GFX906-NEXT:    global_load_dword v2, v6, s[6:7]
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
 ; GFX906-NEXT:  .LBB1_2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    global_store_dword v1, v2, s[2:3]
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v5
+; GFX906-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v3
+; GFX906-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dword v1, v0, s[2:3]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -90,23 +98,32 @@ define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
-; GFX906-NEXT:    v_mov_b32_e32 v3, 0
+; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 3, v0
+; GFX906-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v4, s[4:5]
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v7, s[4:5]
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX906-NEXT:    v_lshrrev_b64 v[3:4], 24, v[1:2]
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v4, s[6:7]
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v7, s[6:7]
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX906-NEXT:    v_lshrrev_b64 v[3:4], 24, v[1:2]
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
 ; GFX906-NEXT:  .LBB2_2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    global_store_byte v3, v2, s[2:3] offset:4
-; GFX906-NEXT:    global_store_dword v3, v1, s[2:3]
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v6
+; GFX906-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v3
+; GFX906-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_byte v5, v2, s[2:3] offset:4
+; GFX906-NEXT:    global_store_dword v5, v0, s[2:3]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -130,19 +147,42 @@ define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 3, v0
 ; GFX906-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v4, s[4:5]
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v10, s[4:5]
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v4, s[6:7]
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v10, s[6:7]
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
 ; GFX906-NEXT:  .LBB3_2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    global_store_dwordx2 v3, v[1:2], s[2:3]
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v9
+; GFX906-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v7
+; GFX906-NEXT:    v_or_b32_sdwa v1, v8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v6
+; GFX906-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v4
+; GFX906-NEXT:    v_or_b32_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx2 v3, v[0:1], s[2:3]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -166,19 +206,64 @@ define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 4, v0
+; GFX906-NEXT:    v_lshlrev_b32_e32 v18, 4, v0
 ; GFX906-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx4 v[1:4], v6, s[4:5]
+; GFX906-NEXT:    global_load_dwordx4 v[1:4], v18, s[4:5]
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 24, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 8, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 8, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 8, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 24, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 16, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v1
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx4 v[1:4], v6, s[6:7]
+; GFX906-NEXT:    global_load_dwordx4 v[1:4], v18, s[6:7]
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 24, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 8, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 8, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 8, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 24, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 16, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v1
 ; GFX906-NEXT:  .LBB4_2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    global_store_dwordx4 v5, v[1:4], s[2:3]
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v17
+; GFX906-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v15
+; GFX906-NEXT:    v_or_b32_sdwa v1, v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v14
+; GFX906-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v12
+; GFX906-NEXT:    v_or_b32_sdwa v2, v13, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v11
+; GFX906-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v9
+; GFX906-NEXT:    v_or_b32_sdwa v3, v10, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v8
+; GFX906-NEXT:    v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v4, 8, v6
+; GFX906-NEXT:    v_or_b32_sdwa v4, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx4 v5, v[0:3], s[2:3]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -201,24 +286,114 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
 ; GFX906-LABEL: v32i8_liveout:
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 5, v0
+; GFX906-NEXT:    v_lshlrev_b32_e32 v31, 5, v0
+; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX906-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx4 v[1:4], v10, s[4:5] offset:16
-; GFX906-NEXT:    global_load_dwordx4 v[5:8], v10, s[4:5]
-; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX906-NEXT:    global_load_dwordx4 v[1:4], v31, s[4:5] offset:16
+; GFX906-NEXT:    global_load_dwordx4 v[5:8], v31, s[4:5]
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 16, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 8, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 24, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 8, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 24, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 16, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v18, 24, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v19, 16, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v20, 8, v1
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v21, 24, v8
+; GFX906-NEXT:    v_lshrrev_b32_e32 v22, 16, v8
+; GFX906-NEXT:    v_lshrrev_b32_e32 v23, 8, v8
+; GFX906-NEXT:    v_lshrrev_b32_e32 v24, 24, v7
+; GFX906-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
+; GFX906-NEXT:    v_lshrrev_b32_e32 v26, 8, v7
+; GFX906-NEXT:    v_lshrrev_b32_e32 v27, 24, v6
+; GFX906-NEXT:    v_lshrrev_b32_e32 v28, 16, v6
+; GFX906-NEXT:    v_lshrrev_b32_e32 v29, 8, v6
+; GFX906-NEXT:    v_lshrrev_b32_e32 v30, 24, v5
+; GFX906-NEXT:    v_lshrrev_b32_e32 v32, 16, v5
+; GFX906-NEXT:    v_lshrrev_b32_e32 v33, 8, v5
+; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx4 v[1:4], v10, s[6:7] offset:16
-; GFX906-NEXT:    global_load_dwordx4 v[5:8], v10, s[6:7]
-; GFX906-NEXT:  .LBB5_2: ; %bb.2
-; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    global_store_dwordx4 v9, v[1:4], s[2:3] offset:16
+; GFX906-NEXT:    global_load_dwordx4 v[1:4], v31, s[6:7] offset:16
+; GFX906-NEXT:    global_load_dwordx4 v[5:8], v31, s[6:7]
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    global_store_dwordx4 v9, v[5:8], s[2:3]
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 16, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 8, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 24, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 8, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 24, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 16, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v18, 24, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v19, 16, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v20, 8, v1
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v21, 24, v8
+; GFX906-NEXT:    v_lshrrev_b32_e32 v22, 16, v8
+; GFX906-NEXT:    v_lshrrev_b32_e32 v23, 8, v8
+; GFX906-NEXT:    v_lshrrev_b32_e32 v24, 24, v7
+; GFX906-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
+; GFX906-NEXT:    v_lshrrev_b32_e32 v26, 8, v7
+; GFX906-NEXT:    v_lshrrev_b32_e32 v27, 24, v6
+; GFX906-NEXT:    v_lshrrev_b32_e32 v28, 16, v6
+; GFX906-NEXT:    v_lshrrev_b32_e32 v29, 8, v6
+; GFX906-NEXT:    v_lshrrev_b32_e32 v30, 24, v5
+; GFX906-NEXT:    v_lshrrev_b32_e32 v32, 16, v5
+; GFX906-NEXT:    v_lshrrev_b32_e32 v33, 8, v5
+; GFX906-NEXT:  .LBB5_2: ; %bb.2
+; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT:    v_lshlrev_b16_e32 v30, 8, v30
+; GFX906-NEXT:    v_lshlrev_b16_e32 v31, 8, v33
+; GFX906-NEXT:    v_lshlrev_b16_e32 v29, 8, v29
+; GFX906-NEXT:    v_lshlrev_b16_e32 v27, 8, v27
+; GFX906-NEXT:    v_lshlrev_b16_e32 v26, 8, v26
+; GFX906-NEXT:    v_lshlrev_b16_e32 v24, 8, v24
+; GFX906-NEXT:    v_lshlrev_b16_e32 v23, 8, v23
+; GFX906-NEXT:    v_lshlrev_b16_e32 v21, 8, v21
+; GFX906-NEXT:    v_or_b32_sdwa v30, v32, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v5, v5, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v6, v6, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v27, v28, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v7, v7, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v24, v25, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v8, v8, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v5, v5, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v6, v6, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v7, v7, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v8, v8, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx4 v9, v[5:8], s[0:1]
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v20
+; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v18
+; GFX906-NEXT:    v_or_b32_sdwa v5, v19, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v17
+; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v15
+; GFX906-NEXT:    v_or_b32_sdwa v5, v16, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v14
+; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v12
+; GFX906-NEXT:    v_or_b32_sdwa v5, v13, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v11
+; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v0, v10, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx4 v9, v[1:4], s[0:1] offset:16
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -240,595 +415,1572 @@ bb.2:
 define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
 ; GFX906-LABEL: v256i8_liveout:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX906-NEXT:    v_lshlrev_b32_e32 v61, 3, v0
 ; GFX906-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
 ; GFX906-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
 ; GFX906-NEXT:    s_mov_b32 s10, -1
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx4 v[5:8], v61, s[4:5] offset:240
 ; GFX906-NEXT:    s_mov_b32 s11, 0xe00000
 ; GFX906-NEXT:    s_add_u32 s8, s8, s3
+; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX906-NEXT:    v_lshlrev_b32_e32 v63, 3, v0
 ; GFX906-NEXT:    s_addc_u32 s9, s9, 0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    global_load_dwordx4 v[17:20], v63, s[4:5] offset:240
+; GFX906-NEXT:    global_load_dwordx4 v[5:8], v63, s[4:5] offset:224
+; GFX906-NEXT:    global_load_dwordx4 v[9:12], v63, s[4:5] offset:208
+; GFX906-NEXT:    global_load_dwordx4 v[13:16], v63, s[4:5] offset:192
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX906-NEXT:    v_mov_b32_e32 v4, 0
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v20
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v20
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v20
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v19
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v19
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v19
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v18
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v18
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v18
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v17
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v17
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 ; 4-byte Folded Spill
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    buffer_store_dword v5, off, s[8:11], 0 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    buffer_store_dword v6, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v7, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v8, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[5:8], v61, s[4:5] offset:224
+; GFX906-NEXT:    buffer_store_dword v18, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v19, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v20, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v17
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v8
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v8
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v8
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v7
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v7
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v7
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v6
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v6
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v5
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v5
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v12
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v12
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v12
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v11
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v11
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v11
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v10
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v10
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v10
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v9
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v9
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v9
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v16
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v16
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v16
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v15
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v15
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v15
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v14
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v14
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v14
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v13
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v13
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v13
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[17:20], v63, s[4:5] offset:176
+; GFX906-NEXT:    global_load_dwordx4 v[21:24], v63, s[4:5] offset:160
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v20
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v20
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v20
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v19
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v19
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v19
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v18
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v18
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v18
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v17
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v17
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v17
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v24
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v24
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v24
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v23
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v23
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v23
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v22
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v22
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v22
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v21
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v21
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v21
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[25:28], v63, s[4:5] offset:144
+; GFX906-NEXT:    global_load_dwordx4 v[29:32], v63, s[4:5] offset:128
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v28
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v28
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v28
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v27
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v27
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v27
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v26
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v26
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v26
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v25
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v25
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v25
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v32
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v32
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v32
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v31
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v31
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v31
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v30
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v30
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v30
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v29
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v29
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v29
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[33:36], v63, s[4:5] offset:112
+; GFX906-NEXT:    global_load_dwordx4 v[37:40], v63, s[4:5] offset:96
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v36
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v36
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v36
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v35
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v35
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v35
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v34
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v34
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v34
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v33
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v33
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v33
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v40
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v40
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v40
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v39
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v39
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v39
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v38
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v38
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v38
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v37
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v37
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v37
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[41:44], v63, s[4:5] offset:80
+; GFX906-NEXT:    global_load_dwordx4 v[45:48], v63, s[4:5] offset:64
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v44
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v44
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v44
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v43
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v43
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v43
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v42
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v42
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v42
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v41
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v41
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v41
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v48
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v48
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v48
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v47
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v47
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v47
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v46
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v46
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v46
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v45
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v45
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v45
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[49:52], v63, s[4:5] offset:48
+; GFX906-NEXT:    global_load_dwordx4 v[53:56], v63, s[4:5] offset:32
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v52
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v52
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v52
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v51
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v51
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v51
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v50
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v50
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v50
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v49
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v49
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v49
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v56
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v56
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v56
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v55
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v55
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v55
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v54
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v54
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v54
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v53
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v53
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v53
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[57:60], v63, s[4:5] offset:16
 ; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    global_load_dwordx4 v[9:12], v61, s[4:5] offset:208
-; GFX906-NEXT:    global_load_dwordx4 v[13:16], v61, s[4:5] offset:192
-; GFX906-NEXT:    global_load_dwordx4 v[17:20], v61, s[4:5] offset:176
-; GFX906-NEXT:    global_load_dwordx4 v[21:24], v61, s[4:5] offset:160
-; GFX906-NEXT:    global_load_dwordx4 v[25:28], v61, s[4:5] offset:144
-; GFX906-NEXT:    global_load_dwordx4 v[29:32], v61, s[4:5] offset:128
-; GFX906-NEXT:    global_load_dwordx4 v[33:36], v61, s[4:5] offset:112
-; GFX906-NEXT:    global_load_dwordx4 v[37:40], v61, s[4:5] offset:96
-; GFX906-NEXT:    global_load_dwordx4 v[41:44], v61, s[4:5] offset:80
-; GFX906-NEXT:    global_load_dwordx4 v[45:48], v61, s[4:5] offset:64
-; GFX906-NEXT:    global_load_dwordx4 v[49:52], v61, s[4:5] offset:48
-; GFX906-NEXT:    global_load_dwordx4 v[53:56], v61, s[4:5] offset:32
-; GFX906-NEXT:    global_load_dwordx4 v[57:60], v61, s[4:5] offset:16
-; GFX906-NEXT:    global_load_dwordx4 v[0:3], v61, s[4:5]
-; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX906-NEXT:    global_load_dwordx4 v[0:3], v63, s[4:5]
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v60
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v60
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v60
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v59
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v59
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v59
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v58
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v58
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v58
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v57
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v57
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v57
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v3
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v3
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v3
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v2
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v2
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v2
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 24, v0
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v1
+; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 16, v0
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v1
+; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 8, v0
+; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx4 v[0:3], v61, s[6:7] offset:240
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    global_load_dwordx4 v[0:3], v63, s[6:7] offset:240
+; GFX906-NEXT:    global_load_dwordx4 v[5:8], v63, s[6:7] offset:224
+; GFX906-NEXT:    global_load_dwordx4 v[9:12], v63, s[6:7] offset:208
+; GFX906-NEXT:    global_load_dwordx4 v[13:16], v63, s[6:7] offset:192
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 24, v3
+; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 16, v3
+; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v3
+; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 24, v2
+; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 16, v2
+; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v2
+; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 24, v1
+; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 16, v1
+; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v1
+; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 24, v0
+; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
+; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 ; 4-byte Folded Spill
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    buffer_store_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
 ; GFX906-NEXT:    buffer_store_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
 ; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[5:8], v61, s[6:7] offset:224
-; GFX906-NEXT:    global_load_dwordx4 v[9:12], v61, s[6:7] offset:208
-; GFX906-NEXT:    global_load_dwordx4 v[13:16], v61, s[6:7] offset:192
-; GFX906-NEXT:    global_load_dwordx4 v[17:20], v61, s[6:7] offset:176
-; GFX906-NEXT:    global_load_dwordx4 v[21:24], v61, s[6:7] offset:160
-; GFX906-NEXT:    global_load_dwordx4 v[25:28], v61, s[6:7] offset:144
-; GFX906-NEXT:    global_load_dwordx4 v[29:32], v61, s[6:7] offset:128
-; GFX906-NEXT:    global_load_dwordx4 v[33:36], v61, s[6:7] offset:112
-; GFX906-NEXT:    global_load_dwordx4 v[37:40], v61, s[6:7] offset:96
-; GFX906-NEXT:    global_load_dwordx4 v[41:44], v61, s[6:7] offset:80
-; GFX906-NEXT:    global_load_dwordx4 v[45:48], v61, s[6:7] offset:64
-; GFX906-NEXT:    global_load_dwordx4 v[49:52], v61, s[6:7] offset:48
-; GFX906-NEXT:    global_load_dwordx4 v[53:56], v61, s[6:7] offset:32
-; GFX906-NEXT:    global_load_dwordx4 v[57:60], v61, s[6:7] offset:16
-; GFX906-NEXT:    global_load_dwordx4 v[0:3], v61, s[6:7]
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v8
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v8
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v8
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v7
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v7
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v7
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v6
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v6
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v5
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v5
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v12
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v12
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v12
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v11
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v11
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v11
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v10
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v10
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v10
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v9
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v9
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v9
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v16
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v16
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v16
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v15
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v15
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v15
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v14
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v14
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v14
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v13
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v13
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v13
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[17:20], v63, s[6:7] offset:176
+; GFX906-NEXT:    global_load_dwordx4 v[21:24], v63, s[6:7] offset:160
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v20
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v20
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v20
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v19
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v19
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v19
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v18
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v18
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v18
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v17
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v17
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v17
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v24
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v24
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v24
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v23
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v23
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v23
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v22
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v22
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v22
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v21
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v21
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v21
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[25:28], v63, s[6:7] offset:144
+; GFX906-NEXT:    global_load_dwordx4 v[29:32], v63, s[6:7] offset:128
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v28
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v28
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v28
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v27
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v27
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v27
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v26
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v26
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v26
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v25
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v25
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v25
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v32
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v32
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v32
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v31
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v31
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v31
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v30
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v30
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v30
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v29
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v29
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v29
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[33:36], v63, s[6:7] offset:112
+; GFX906-NEXT:    global_load_dwordx4 v[37:40], v63, s[6:7] offset:96
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v36
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v36
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v36
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v35
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v35
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v35
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v34
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v34
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v34
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v33
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v33
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v33
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v40
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v40
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v40
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v39
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v39
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v39
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v38
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v38
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v38
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v37
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v37
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v37
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[41:44], v63, s[6:7] offset:80
+; GFX906-NEXT:    global_load_dwordx4 v[45:48], v63, s[6:7] offset:64
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v44
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v44
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v44
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v43
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v43
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v43
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v42
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v42
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v42
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v41
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v41
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v41
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v48
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v48
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v48
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v47
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v47
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v47
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v46
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v46
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v46
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v45
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v45
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v45
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[49:52], v63, s[6:7] offset:48
+; GFX906-NEXT:    global_load_dwordx4 v[53:56], v63, s[6:7] offset:32
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v52
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v52
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v52
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v51
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v51
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v51
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v50
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v50
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v50
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v49
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v49
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v49
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v56
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v56
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v56
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v55
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v55
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v55
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v54
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v54
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v54
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v53
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v53
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v53
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[57:60], v63, s[6:7] offset:16
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    global_load_dwordx4 v[0:3], v63, s[6:7]
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v60
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v60
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v60
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v59
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v59
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v59
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v58
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v58
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v58
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v57
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v57
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v57
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v3
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v3
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v3
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v2
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v2
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v2
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 24, v0
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v1
+; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 16, v0
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v1
+; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 8, v0
 ; GFX906-NEXT:  .LBB6_2: ; %bb.2
-; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX906-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(0)
-; GFX906-NEXT:    global_store_dwordx4 v4, v[33:36], s[0:1] offset:112
-; GFX906-NEXT:    s_waitcnt vmcnt(7)
-; GFX906-NEXT:    global_store_dwordx4 v4, v[37:40], s[0:1] offset:96
-; GFX906-NEXT:    s_waitcnt vmcnt(7)
-; GFX906-NEXT:    global_store_dwordx4 v4, v[41:44], s[0:1] offset:80
-; GFX906-NEXT:    s_waitcnt vmcnt(7)
-; GFX906-NEXT:    global_store_dwordx4 v4, v[45:48], s[0:1] offset:64
-; GFX906-NEXT:    s_waitcnt vmcnt(7)
-; GFX906-NEXT:    global_store_dwordx4 v4, v[49:52], s[0:1] offset:48
-; GFX906-NEXT:    s_waitcnt vmcnt(7)
-; GFX906-NEXT:    global_store_dwordx4 v4, v[53:56], s[0:1] offset:32
-; GFX906-NEXT:    s_waitcnt vmcnt(7)
-; GFX906-NEXT:    global_store_dwordx4 v4, v[57:60], s[0:1] offset:16
-; GFX906-NEXT:    s_waitcnt vmcnt(7)
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX906-NEXT:    v_lshlrev_b16_e32 v61, 8, v61
+; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b16_e32 v62, 8, v62
+; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v63, off, s[8:11], 0 offset:760 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v61, 8, v61
+; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v61, 8, v61
+; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v61, off, s[8:11], 0 offset:768 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v61, 8, v61
+; GFX906-NEXT:    v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v62, off, s[8:11], 0 offset:756 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v62, 8, v62
+; GFX906-NEXT:    v_or_b32_sdwa v62, v63, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v62, off, s[8:11], 0 offset:748 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v61, 8, v61
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v61, off, s[8:11], 0 offset:736 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v62, off, s[8:11], 0 offset:740 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v61, 8, v61
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:732 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:240
-; GFX906-NEXT:    global_store_dwordx4 v4, v[5:8], s[0:1] offset:224
-; GFX906-NEXT:    global_store_dwordx4 v4, v[9:12], s[0:1] offset:208
-; GFX906-NEXT:    global_store_dwordx4 v4, v[13:16], s[0:1] offset:192
-; GFX906-NEXT:    global_store_dwordx4 v4, v[17:20], s[0:1] offset:176
-; GFX906-NEXT:    global_store_dwordx4 v4, v[21:24], s[0:1] offset:160
-; GFX906-NEXT:    global_store_dwordx4 v4, v[25:28], s[0:1] offset:144
-; GFX906-NEXT:    global_store_dwordx4 v4, v[29:32], s[0:1] offset:128
-; GFX906-NEXT:    s_endpgm
-entry:
-  %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
-  %vec1 = load <256 x i8>, ptr addrspace(1) %gep1
-  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
-  %vec2 = load <256 x i8>, ptr addrspace(1) %gep2
-  %cmp = icmp ult i32 %idx, 15
-  br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
-  br label %bb.2
-
-bb.2:
-  %tmp5 = phi <256 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
-  store <256 x i8> %tmp5, ptr addrspace(1) %dst, align 4
-  ret void
-}
-
-define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: repeat_successor:
-; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_load_dword s8, s[0:1], 0x24
-; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    s_cmp_lt_i32 s8, 3
-; GFX906-NEXT:    s_cbranch_scc0 .LBB7_3
-; GFX906-NEXT:  ; %bb.1: ; %LeafBlock
-; GFX906-NEXT:    s_cmp_gt_i32 s8, 0
-; GFX906-NEXT:    s_cbranch_scc0 .LBB7_6
-; GFX906-NEXT:  ; %bb.2:
-; GFX906-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX906-NEXT:    global_load_dword v0, v0, s[4:5]
-; GFX906-NEXT:    s_branch .LBB7_5
-; GFX906-NEXT:  .LBB7_3: ; %LeafBlock5
-; GFX906-NEXT:    s_cmp_eq_u32 s8, 3
-; GFX906-NEXT:    s_cbranch_scc0 .LBB7_6
-; GFX906-NEXT:  ; %bb.4: ; %sw.bb5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX906-NEXT:    global_load_dword v0, v0, s[6:7]
-; GFX906-NEXT:  .LBB7_5: ; %return.sink.split
-; GFX906-NEXT:    v_mov_b32_e32 v1, 0
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:728 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:716 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:704 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT:    buffer_load_dword v57, off, s[8:11], 0 offset:720 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v58, off, s[8:11], 0 offset:724 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v2, v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v59, off, s[8:11], 0 offset:712 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v57, 8, v57
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v58, off, s[8:11], 0 offset:708 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v57, off, s[8:11], 0 offset:696 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v58, 8, v58
+; GFX906-NEXT:    v_or_b32_sdwa v58, v59, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v58, off, s[8:11], 0 offset:700 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v57, 8, v57
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    global_store_dword v1, v0, s[2:3]
-; GFX906-NEXT:  .LBB7_6: ; %return
-; GFX906-NEXT:    s_endpgm
-entry:
-  %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
-  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
-  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
-  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
-  switch i32 %in, label %return [
-  i32 1, label %return.sink.split
-  i32 2, label %return.sink.split
-  i32 3, label %sw.bb5
-  ]
-
-sw.bb5:
-  br label %return.sink.split
-
-return.sink.split:
-  %tmp5 = phi <4 x i8> [ %vec2, %sw.bb5 ], [ %vec1, %entry ], [ %vec1, %entry ]
-  store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4
-  ret void
-
-return:
-  ret void
-}
-
-define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
-; GFX906-LABEL: v8i8_phi_chain:
-; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
-; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[0:1]
-; GFX906-NEXT:    v_cmp_lt_u32_e64 s[0:1], 14, v0
-; GFX906-NEXT:    s_and_saveexec_b64 s[8:9], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB8_2
-; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[2:3]
-; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v0
-; GFX906-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
-; GFX906-NEXT:    s_and_b64 s[2:3], vcc, exec
-; GFX906-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
-; GFX906-NEXT:  .LBB8_2: ; %Flow
-; GFX906-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
-; GFX906-NEXT:    s_cbranch_execz .LBB8_4
-; GFX906-NEXT:  ; %bb.3: ; %bb.2
-; GFX906-NEXT:    v_mov_b32_e32 v0, 0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[4:5]
-; GFX906-NEXT:  .LBB8_4: ; %bb.3
-; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX906-NEXT:    v_mov_b32_e32 v0, 0
+; GFX906-NEXT:    v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v57, off, s[8:11], 0 offset:688 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v58, off, s[8:11], 0 offset:692 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT:    v_or_b32_sdwa v3, v60, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v57, 8, v57
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[6:7]
-; GFX906-NEXT:    s_endpgm
-entry:
-  %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
-  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
-  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
-  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
-  %cmp = icmp ult i32 %idx, 15
-  br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
-  %cmp2 = icmp ult i32 %idx, 7
-  br i1 %cmp2, label %bb.2, label %bb.3
-
-bb.2:
-  %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
-  store <8 x i8> %tmp5, ptr addrspace(1) %dst0, align 4
-  br label %bb.3
-
-bb.3:
-  %tmp7 = phi <8 x i8> [ %vec2, %bb.1], [%tmp5, %bb.2]
-  store <8 x i8> %tmp7, ptr addrspace(1) %dst1, align 4
-  ret void
-}
-
-
-define amdgpu_kernel void @v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
-; GFX906-LABEL: v8i8_phi_zeroinit:
-; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
-; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT:    ; implicit-def: $vgpr1_vgpr2
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx2 v[3:4], v5, s[0:1]
-; GFX906-NEXT:    v_cmp_lt_u32_e64 s[0:1], 14, v0
-; GFX906-NEXT:    s_and_saveexec_b64 s[8:9], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB9_2
-; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v5, s[2:3]
-; GFX906-NEXT:    s_mov_b32 s2, 0
-; GFX906-NEXT:    s_mov_b32 s3, s2
-; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_mov_b32_e32 v4, s3
-; GFX906-NEXT:    v_mov_b32_e32 v3, s2
-; GFX906-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
-; GFX906-NEXT:    s_and_b64 s[2:3], vcc, exec
-; GFX906-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
-; GFX906-NEXT:  .LBB9_2: ; %Flow
-; GFX906-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
-; GFX906-NEXT:    s_cbranch_execz .LBB9_4
-; GFX906-NEXT:  ; %bb.3: ; %bb.2
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_mov_b32_e32 v1, v3
-; GFX906-NEXT:    v_mov_b32_e32 v0, 0
-; GFX906-NEXT:    v_mov_b32_e32 v2, v4
-; GFX906-NEXT:    global_store_dwordx2 v0, v[3:4], s[4:5]
-; GFX906-NEXT:  .LBB9_4: ; %bb.3
-; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX906-NEXT:    v_mov_b32_e32 v0, 0
+; GFX906-NEXT:    v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:680 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:668 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:656 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT:    buffer_load_dword v53, off, s[8:11], 0 offset:672 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v54, off, s[8:11], 0 offset:676 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v55, off, s[8:11], 0 offset:664 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v53, 8, v53
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v54, off, s[8:11], 0 offset:660 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v53, off, s[8:11], 0 offset:648 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v54, 8, v54
+; GFX906-NEXT:    v_or_b32_sdwa v54, v55, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v54, off, s[8:11], 0 offset:652 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v53, 8, v53
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[6:7]
-; GFX906-NEXT:    s_endpgm
-entry:
-  %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
-  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
-  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
-  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
-  %cmp = icmp ult i32 %idx, 15
-  br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
-  %cmp2 = icmp ult i32 %idx, 7
-  br i1 %cmp2, label %bb.2, label %bb.3
-
-bb.2:
-  %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ zeroinitializer, %bb.1 ]
-  store <8 x i8> %tmp5, ptr addrspace(1) %dst0, align 4
-  br label %bb.3
-
-bb.3:
-  %tmp7 = phi <8 x i8> [ %vec2, %bb.1], [%tmp5, %bb.2]
-  store <8 x i8> %tmp7, ptr addrspace(1) %dst1, align 4
-  ret void
-}
-
-define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
-; GFX906-LABEL: v8i8_phi_const:
-; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
-; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v1, s[0:1]
-; GFX906-NEXT:    v_cmp_lt_u32_e64 s[0:1], 14, v0
+; GFX906-NEXT:    v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v53, off, s[8:11], 0 offset:640 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v54, off, s[8:11], 0 offset:644 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT:    v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v53, 8, v53
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
-; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB10_2
-; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v0
-; GFX906-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
-; GFX906-NEXT:    s_and_b64 s[6:7], vcc, exec
-; GFX906-NEXT:    v_mov_b32_e32 v1, 1
-; GFX906-NEXT:    v_mov_b32_e32 v8, 2
-; GFX906-NEXT:    v_mov_b32_e32 v6, 3
-; GFX906-NEXT:    v_mov_b32_e32 v7, 4
-; GFX906-NEXT:    v_mov_b32_e32 v2, 5
-; GFX906-NEXT:    v_mov_b32_e32 v5, 6
-; GFX906-NEXT:    v_mov_b32_e32 v3, 7
-; GFX906-NEXT:    v_mov_b32_e32 v4, 8
-; GFX906-NEXT:    s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX906-NEXT:  .LBB10_2: ; %Flow
-; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
-; GFX906-NEXT:    s_cbranch_execz .LBB10_4
-; GFX906-NEXT:  ; %bb.3: ; %bb.2
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v8
-; GFX906-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v7
-; GFX906-NEXT:    v_or_b32_sdwa v1, v6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:32
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:632 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:620 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:608 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT:    buffer_load_dword v49, off, s[8:11], 0 offset:624 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v50, off, s[8:11], 0 offset:628 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v2, v51, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v51, off, s[8:11], 0 offset:616 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v49, 8, v49
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v50, off, s[8:11], 0 offset:612 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v49, off, s[8:11], 0 offset:600 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v50, 8, v50
+; GFX906-NEXT:    v_or_b32_sdwa v50, v51, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v50, off, s[8:11], 0 offset:604 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v49, 8, v49
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v49, off, s[8:11], 0 offset:592 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v50, off, s[8:11], 0 offset:596 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT:    v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v49, 8, v49
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:48
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:584 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:572 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:560 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT:    buffer_load_dword v45, off, s[8:11], 0 offset:576 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v46, off, s[8:11], 0 offset:580 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v2, v47, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v47, off, s[8:11], 0 offset:568 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v45, 8, v45
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v46, off, s[8:11], 0 offset:564 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v45, off, s[8:11], 0 offset:552 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v46, 8, v46
+; GFX906-NEXT:    v_or_b32_sdwa v46, v47, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v46, off, s[8:11], 0 offset:556 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v45, 8, v45
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v45, off, s[8:11], 0 offset:544 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v46, off, s[8:11], 0 offset:548 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT:    v_or_b32_sdwa v3, v48, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v45, 8, v45
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:64
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:536 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:524 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:512 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT:    buffer_load_dword v41, off, s[8:11], 0 offset:528 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v42, off, s[8:11], 0 offset:532 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v2, v43, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v43, off, s[8:11], 0 offset:520 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v41, 8, v41
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v42, off, s[8:11], 0 offset:516 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v41, off, s[8:11], 0 offset:504 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v42, 8, v42
+; GFX906-NEXT:    v_or_b32_sdwa v42, v43, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v42, off, s[8:11], 0 offset:508 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v41, 8, v41
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v41, off, s[8:11], 0 offset:496 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v42, off, s[8:11], 0 offset:500 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT:    v_or_b32_sdwa v3, v44, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v41, 8, v41
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:80
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:488 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:476 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:464 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT:    buffer_load_dword v37, off, s[8:11], 0 offset:480 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v38, off, s[8:11], 0 offset:484 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v2, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v39, off, s[8:11], 0 offset:472 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v37, 8, v37
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v38, off, s[8:11], 0 offset:468 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v37, off, s[8:11], 0 offset:456 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v38, 8, v38
+; GFX906-NEXT:    v_or_b32_sdwa v38, v39, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v38, off, s[8:11], 0 offset:460 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v37, 8, v37
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v37, off, s[8:11], 0 offset:448 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v38, off, s[8:11], 0 offset:452 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT:    v_or_b32_sdwa v3, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v37, 8, v37
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:96
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:440 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:428 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:416 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT:    buffer_load_dword v33, off, s[8:11], 0 offset:432 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v34, off, s[8:11], 0 offset:436 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v2, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v35, off, s[8:11], 0 offset:424 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v33, 8, v33
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v34, off, s[8:11], 0 offset:420 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v33, off, s[8:11], 0 offset:408 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v34, 8, v34
+; GFX906-NEXT:    v_or_b32_sdwa v34, v35, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v34, off, s[8:11], 0 offset:412 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v33, 8, v33
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v33, off, s[8:11], 0 offset:400 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v34, off, s[8:11], 0 offset:404 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT:    v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v33, 8, v33
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:112
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:392 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:380 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:368 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT:    buffer_load_dword v29, off, s[8:11], 0 offset:384 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v30, off, s[8:11], 0 offset:388 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v2, v31, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v31, off, s[8:11], 0 offset:376 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v29, 8, v29
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v30, off, s[8:11], 0 offset:372 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v29, off, s[8:11], 0 offset:360 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v30, 8, v30
+; GFX906-NEXT:    v_or_b32_sdwa v30, v31, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v30, off, s[8:11], 0 offset:364 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v29, 8, v29
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v29, off, s[8:11], 0 offset:352 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v30, off, s[8:11], 0 offset:356 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT:    v_or_b32_sdwa v3, v32, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v29, 8, v29
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:128
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:344 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:332 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:320 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT:    buffer_load_dword v25, off, s[8:11], 0 offset:336 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v26, off, s[8:11], 0 offset:340 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v2, v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v27, off, s[8:11], 0 offset:328 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v25, 8, v25
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v26, off, s[8:11], 0 offset:324 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v25, off, s[8:11], 0 offset:312 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v26, 8, v26
+; GFX906-NEXT:    v_or_b32_sdwa v26, v27, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v26, off, s[8:11], 0 offset:316 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v25, 8, v25
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v25, off, s[8:11], 0 offset:304 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v26, off, s[8:11], 0 offset:308 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT:    v_or_b32_sdwa v3, v28, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v25, 8, v25
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:144
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:296 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:284 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:272 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT:    buffer_load_dword v21, off, s[8:11], 0 offset:288 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v22, off, s[8:11], 0 offset:292 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v2, v23, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v23, off, s[8:11], 0 offset:280 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v21, 8, v21
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v22, off, s[8:11], 0 offset:276 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v21, off, s[8:11], 0 offset:264 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v22, 8, v22
+; GFX906-NEXT:    v_or_b32_sdwa v22, v23, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v22, off, s[8:11], 0 offset:268 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v21, 8, v21
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v21, off, s[8:11], 0 offset:256 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v22, off, s[8:11], 0 offset:260 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT:    v_or_b32_sdwa v3, v24, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v21, 8, v21
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:160
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:248 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:236 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:224 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT:    buffer_load_dword v17, off, s[8:11], 0 offset:240 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v18, off, s[8:11], 0 offset:244 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v19, off, s[8:11], 0 offset:232 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v17, 8, v17
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v18, off, s[8:11], 0 offset:228 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v17, off, s[8:11], 0 offset:216 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v18, 8, v18
+; GFX906-NEXT:    v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v18, off, s[8:11], 0 offset:220 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v17, 8, v17
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v17, off, s[8:11], 0 offset:208 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v18, off, s[8:11], 0 offset:212 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT:    v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v17, 8, v17
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:176
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:204 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:192 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:184 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:188 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT:    v_or_b32_sdwa v3, v14, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v14, off, s[8:11], 0 offset:164 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:196 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:180 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:176 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v13, off, s[8:11], 0 offset:160 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:172 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v13, 8, v13
+; GFX906-NEXT:    v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT:    v_or_b32_sdwa v3, v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:168 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT:    v_or_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:192
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:152 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:148 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:140 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:128 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v5
-; GFX906-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v4
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:144 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:136 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx2 v9, v[0:1], s[4:5]
-; GFX906-NEXT:  .LBB10_4: ; %bb.3
-; GFX906-NEXT:    s_endpgm
-entry:
-  %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
-  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
-  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
-  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
-  %cmp = icmp ult i32 %idx, 15
-  br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
-  %cmp2 = icmp ult i32 %idx, 7
-  br i1 %cmp2, label %bb.2, label %bb.3
-
-bb.2:
-  %tmp5 = phi <8 x i8> [ %vec1, %entry ], [<i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, %bb.1 ]
-  store <8 x i8> %tmp5, ptr addrspace(1) %dst0, align 4
-  br label %bb.3
-
-bb.3:
-  %tmp7 = phi <8 x i8> [ %vec2, %bb.1], [%tmp5, %bb.2]
-  store <8 x i8> %tmp7, ptr addrspace(1) %dst1, align 4
-  ret void
-}
-
-define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
-; GFX906-LABEL: v8i8_multi_block:
-; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
-; GFX906-NEXT:    v_mov_b32_e32 v5, 0
-; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx2 v[3:4], v6, s[0:1]
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:132 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:124 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:116 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT:    v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:120 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:112 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_mov_b32_e32 v1, v3
-; GFX906-NEXT:    v_mov_b32_e32 v2, v4
-; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB11_4
-; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v6, s[2:3]
-; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v0
-; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB11_3
-; GFX906-NEXT:  ; %bb.2: ; %bb.2
-; GFX906-NEXT:    v_mov_b32_e32 v0, 0
-; GFX906-NEXT:    global_store_dwordx2 v0, v[3:4], s[4:5]
-; GFX906-NEXT:  .LBB11_3: ; %Flow
-; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX906-NEXT:  .LBB11_4: ; %bb.3
-; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX906-NEXT:    v_lshlrev_b16_e32 v9, 8, v9
+; GFX906-NEXT:    v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:208
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:104 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:100 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:92 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:80 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:96 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:88 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    global_store_dwordx2 v5, v[1:2], s[6:7]
-; GFX906-NEXT:    s_endpgm
-entry:
-  %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
-  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
-  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
-  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
-  %cmp = icmp ult i32 %idx, 15
-  br i1 %cmp, label %bb.1, label %bb.3
-bb.1:
-  %cmp2 = icmp ult i32 %idx, 7
-  br i1 %cmp2, label %bb.2, label %bb.3
-
-bb.2:
-  store <8 x i8> %vec1, ptr addrspace(1) %dst0, align 4
-  br label %bb.3
-
-bb.3:
-  %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ], [ %vec2, %bb.2]
-  store <8 x i8> %tmp5, ptr addrspace(1) %dst1, align 4
-  ret void
-}
-
-define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: v32i8_loop_carried:
-; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 5, v0
-; GFX906-NEXT:    v_cmp_lt_u32_e32 vcc, 14, v0
-; GFX906-NEXT:    s_mov_b32 s4, 0x2000604
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dword v1, v1, s[2:3]
-; GFX906-NEXT:    s_mov_b64 s[2:3], 0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_mov_b32_e32 v0, v1
-; GFX906-NEXT:  .LBB12_1: ; %bb.1
-; GFX906-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX906-NEXT:    s_and_b64 s[6:7], exec, vcc
-; GFX906-NEXT:    s_or_b64 s[2:3], s[6:7], s[2:3]
-; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
-; GFX906-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX906-NEXT:    s_cbranch_execnz .LBB12_1
-; GFX906-NEXT:  ; %bb.2: ; %bb.2.loopexit
-; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX906-NEXT:    v_mov_b32_e32 v1, 0
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_store_dword v1, v0, s[0:1]
-; GFX906-NEXT:    s_endpgm
-entry:
-  %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx
-  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
-  br label %bb.1
-
-bb.1:
-  %temp = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
-  %vec2 = shufflevector <4 x i8> %vec1, <4 x i8> %temp, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-  %cmp = icmp ult i32 %idx, 15
-  br i1 %cmp, label %bb.1, label %bb.2
-  br label %bb.2
-
-bb.2:
-  store <4 x i8> %vec2, ptr addrspace(1) %dst, align 4
-  ret void
-}
-
-; Should not have instances of "Instruction does not dominate all uses!"
-
-define amdgpu_kernel void @v8i8_multiuse_multiblock(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst1, ptr addrspace(1) nocapture %dst2, ptr addrspace(1) nocapture %dst3) {
-; GFX906-LABEL: v8i8_multiuse_multiblock:
-; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
-; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
-; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
-; GFX906-NEXT:    v_cmp_lt_u32_e64 s[2:3], 14, v0
-; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v1, s[4:5]
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:84 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:76 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:68 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT:    v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:72 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:64 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX906-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB13_2
-; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    s_movk_i32 s6, 0xff00
-; GFX906-NEXT:    v_mov_b32_e32 v5, 8
-; GFX906-NEXT:    v_and_b32_sdwa v6, v1, s6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT:    s_mov_b32 s6, 0x6070504
-; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v0
-; GFX906-NEXT:    v_and_b32_e32 v4, 0xffffff00, v1
-; GFX906-NEXT:    v_lshlrev_b16_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX906-NEXT:    v_perm_b32 v7, v1, v1, s6
-; GFX906-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
-; GFX906-NEXT:    s_and_b64 s[6:7], vcc, exec
-; GFX906-NEXT:    v_mov_b32_e32 v3, 0
-; GFX906-NEXT:    v_or_b32_sdwa v4, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v6, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX906-NEXT:    s_or_b64 s[2:3], s[2:3], s[6:7]
-; GFX906-NEXT:    v_or_b32_sdwa v6, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dword v3, v1, s[8:9]
-; GFX906-NEXT:    global_store_dword v3, v7, s[8:9] offset:8
-; GFX906-NEXT:    global_store_dword v3, v6, s[8:9] offset:16
-; GFX906-NEXT:    global_store_dword v3, v4, s[8:9] offset:24
-; GFX906-NEXT:  .LBB13_2: ; %Flow
-; GFX906-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX906-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
-; GFX906-NEXT:    s_cbranch_execz .LBB13_4
-; GFX906-NEXT:  ; %bb.3: ; %bb.2
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v2
-; GFX906-NEXT:    v_and_b32_e32 v4, 0xffffff00, v2
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xffffff00, v1
-; GFX906-NEXT:    s_mov_b32 s2, 0xc0c0001
-; GFX906-NEXT:    v_or_b32_sdwa v3, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v5, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_perm_b32 v2, 0, v2, s2
-; GFX906-NEXT:    v_mov_b32_e32 v0, 0
-; GFX906-NEXT:    v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_perm_b32 v6, 0, v1, s2
-; GFX906-NEXT:    s_mov_b32 s3, 0xffff0000
-; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX906-NEXT:    v_and_or_b32 v7, v1, s3, v6
-; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_e32 v2, v6, v2
-; GFX906-NEXT:    global_store_dword v0, v3, s[10:11]
-; GFX906-NEXT:    global_store_dword v0, v4, s[10:11] offset:8
-; GFX906-NEXT:    global_store_dword v0, v7, s[10:11] offset:16
-; GFX906-NEXT:    global_store_dword v0, v2, s[10:11] offset:24
-; GFX906-NEXT:  .LBB13_4: ; %bb.3
-; GFX906-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX906-NEXT:    s_movk_i32 s3, 0xff00
-; GFX906-NEXT:    v_mov_b32_e32 v4, 8
-; GFX906-NEXT:    s_movk_i32 s2, 0xff
-; GFX906-NEXT:    v_and_b32_sdwa v2, v1, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX906-NEXT:    v_or_b32_sdwa v3, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v5, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v6, 8, v1
-; GFX906-NEXT:    v_and_b32_sdwa v7, v1, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT:    v_mov_b32_e32 v0, 0
-; GFX906-NEXT:    v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v7, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v4, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dword v0, v3, s[0:1]
-; GFX906-NEXT:    global_store_dword v0, v1, s[0:1] offset:8
-; GFX906-NEXT:    global_store_dword v0, v4, s[0:1] offset:16
-; GFX906-NEXT:    global_store_dword v0, v2, s[0:1] offset:24
+; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
+; GFX906-NEXT:    v_or_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:224
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:60 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:52 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:56 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:44 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(7)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:48 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:40 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:32 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:36 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:20 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT:    v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:24 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
+; GFX906-NEXT:    v_or_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:240
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
   %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
-  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
+  %vec1 = load <256 x i8>, ptr addrspace(1) %gep1
   %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
-  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
+  %vec2 = load <256 x i8>, ptr addrspace(1) %gep2
   %cmp = icmp ult i32 %idx, 15
   br i1 %cmp, label %bb.1, label %bb.2
 bb.1:
-  %s1 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %s2 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
-  %s3 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-  %s4 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 2, i32 3, i32 1>
-  %gep4 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst1, i32 0
-  %gep5 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst1, i32 1
-  %gep6 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst1, i32 2
-  %gep7 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst1, i32 3
-  store <4 x i8> %s1, ptr addrspace(1) %gep4, align 4
-  store <4 x i8> %s2, ptr addrspace(1) %gep5, align 4
-  store <4 x i8> %s3, ptr addrspace(1) %gep6, align 4
-  store <4 x i8> %s4, ptr addrspace(1) %gep7, align 4
-  %cmp2 = icmp ult i32 %idx, 7
-  br i1 %cmp2, label %bb.2, label %bb.3
+  br label %bb.2
 
 bb.2:
-  %s5 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 3, i32 1, i32 2>
-  %s6 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
-  %s7 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
-  %s8 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-  %gep8 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst2, i32 0
-  %gep9 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst2, i32 1
-  %gep10 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst2, i32 2
-  %gep11 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst2, i32 3
-  store <4 x i8> %s5, ptr addrspace(1) %gep8, align 4
-  store <4 x i8> %s6, ptr addrspace(1) %gep9, align 4
-  store <4 x i8> %s7, ptr addrspace(1) %gep10, align 4
-  store <4 x i8> %s8, ptr addrspace(1) %gep11, align 4
-  br label %bb.3
-
-bb.3:
-  %s9 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
-  %s10 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
-  %s11 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
-  %s12 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-  %gep12 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst3, i32 0
-  %gep13 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst3, i32 1
-  %gep14 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst3, i32 2
-  %gep15 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst3, i32 3
-  store <4 x i8> %s9, ptr addrspace(1) %gep12, align 4
-  store <4 x i8> %s10, ptr addrspace(1) %gep13, align 4
-  store <4 x i8> %s11, ptr addrspace(1) %gep14, align 4
-  store <4 x i8> %s12, ptr addrspace(1) %gep15, align 4
+  %tmp5 = phi <256 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+  store <256 x i8> %tmp5, ptr addrspace(1) %dst, align 4
   ret void
 }
 
-
 declare i32 @llvm.amdgcn.workitem.id.x()
+

diff  --git a/llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll b/llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll
deleted file mode 100644
index 5d2e299aa854a..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll
+++ /dev/null
@@ -1,352 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -mtriple=amdgcn -mcpu=gfx906 -amdgpu-late-codegenprepare -S -o - %s | FileCheck --check-prefix=GFX906 %s
-
-define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: define amdgpu_kernel void @v3i8_liveout(
-; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
-; GFX906-NEXT:  entry:
-; GFX906-NEXT:    [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX906-NEXT:    [[GEP1:%.*]] = getelementptr <3 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
-; GFX906-NEXT:    [[VEC1:%.*]] = load <3 x i8>, ptr addrspace(1) [[GEP1]], align 4
-; GFX906-NEXT:    [[TMP0:%.*]] = shufflevector <3 x i8> [[VEC1]], <3 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; GFX906-NEXT:    [[VEC1_BC:%.*]] = bitcast <4 x i8> [[TMP0]] to i32
-; GFX906-NEXT:    [[GEP2:%.*]] = getelementptr <3 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
-; GFX906-NEXT:    [[VEC2:%.*]] = load <3 x i8>, ptr addrspace(1) [[GEP2]], align 4
-; GFX906-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i8> [[VEC2]], <3 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; GFX906-NEXT:    [[VEC2_BC:%.*]] = bitcast <4 x i8> [[TMP1]] to i32
-; GFX906-NEXT:    [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
-; GFX906-NEXT:    br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]]
-; GFX906:       bb.1:
-; GFX906-NEXT:    br label [[BB_2]]
-; GFX906:       bb.2:
-; GFX906-NEXT:    [[TMP5_TC:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
-; GFX906-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP5_TC]] to i24
-; GFX906-NEXT:    [[TMP3:%.*]] = bitcast i24 [[TMP2]] to <3 x i8>
-; GFX906-NEXT:    store <3 x i8> [[TMP3]], ptr addrspace(1) [[DST]], align 4
-; GFX906-NEXT:    ret void
-;
-entry:
-  %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <3 x i8>, ptr addrspace(1) %src1, i32 %idx
-  %vec1 = load <3 x i8>, ptr addrspace(1) %gep1
-  %gep2 = getelementptr <3 x i8>, ptr addrspace(1) %src2, i32 %idx
-  %vec2 = load <3 x i8>, ptr addrspace(1) %gep2
-  %cmp = icmp ult i32 %idx, 15
-  br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
-  br label %bb.2
-
-bb.2:
-  %tmp5 = phi <3 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
-  store <3 x i8> %tmp5, ptr addrspace(1) %dst, align 4
-  ret void
-}
-
-define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: define amdgpu_kernel void @v4i8_liveout(
-; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0]] {
-; GFX906-NEXT:  entry:
-; GFX906-NEXT:    [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX906-NEXT:    [[GEP1:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
-; GFX906-NEXT:    [[VEC1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 4
-; GFX906-NEXT:    [[VEC1_BC:%.*]] = bitcast <4 x i8> [[VEC1]] to i32
-; GFX906-NEXT:    [[GEP2:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
-; GFX906-NEXT:    [[VEC2:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP2]], align 4
-; GFX906-NEXT:    [[VEC2_BC:%.*]] = bitcast <4 x i8> [[VEC2]] to i32
-; GFX906-NEXT:    [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
-; GFX906-NEXT:    br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]]
-; GFX906:       bb.1:
-; GFX906-NEXT:    br label [[BB_2]]
-; GFX906:       bb.2:
-; GFX906-NEXT:    [[TMP5_TC:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
-; GFX906-NEXT:    [[TMP5_TC_BC:%.*]] = bitcast i32 [[TMP5_TC]] to <4 x i8>
-; GFX906-NEXT:    store <4 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST]], align 4
-; GFX906-NEXT:    ret void
-;
-entry:
-  %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
-  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
-  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
-  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
-  %cmp = icmp ult i32 %idx, 15
-  br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
-  br label %bb.2
-
-bb.2:
-  %tmp5 = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
-  store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4
-  ret void
-}
-
-define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: define amdgpu_kernel void @v5i8_liveout(
-; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0]] {
-; GFX906-NEXT:  entry:
-; GFX906-NEXT:    [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX906-NEXT:    [[GEP1:%.*]] = getelementptr <5 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
-; GFX906-NEXT:    [[VEC1:%.*]] = load <5 x i8>, ptr addrspace(1) [[GEP1]], align 8
-; GFX906-NEXT:    [[TMP0:%.*]] = shufflevector <5 x i8> [[VEC1]], <5 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 5, i32 5>
-; GFX906-NEXT:    [[VEC1_BC:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-; GFX906-NEXT:    [[GEP2:%.*]] = getelementptr <5 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
-; GFX906-NEXT:    [[VEC2:%.*]] = load <5 x i8>, ptr addrspace(1) [[GEP2]], align 8
-; GFX906-NEXT:    [[TMP1:%.*]] = shufflevector <5 x i8> [[VEC2]], <5 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 5, i32 5>
-; GFX906-NEXT:    [[VEC2_BC:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-; GFX906-NEXT:    [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
-; GFX906-NEXT:    br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]]
-; GFX906:       bb.1:
-; GFX906-NEXT:    br label [[BB_2]]
-; GFX906:       bb.2:
-; GFX906-NEXT:    [[TMP5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
-; GFX906-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[TMP5_TC]] to <8 x i8>
-; GFX906-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4>
-; GFX906-NEXT:    store <5 x i8> [[TMP3]], ptr addrspace(1) [[DST]], align 4
-; GFX906-NEXT:    ret void
-;
-entry:
-  %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <5 x i8>, ptr addrspace(1) %src1, i32 %idx
-  %vec1 = load <5 x i8>, ptr addrspace(1) %gep1
-  %gep2 = getelementptr <5 x i8>, ptr addrspace(1) %src2, i32 %idx
-  %vec2 = load <5 x i8>, ptr addrspace(1) %gep2
-  %cmp = icmp ult i32 %idx, 15
-  br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
-  br label %bb.2
-
-bb.2:
-  %tmp5 = phi <5 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
-  store <5 x i8> %tmp5, ptr addrspace(1) %dst, align 4
-  ret void
-}
-
-define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: define amdgpu_kernel void @v8i8_liveout(
-; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0]] {
-; GFX906-NEXT:  entry:
-; GFX906-NEXT:    [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX906-NEXT:    [[GEP1:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
-; GFX906-NEXT:    [[VEC1:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP1]], align 8
-; GFX906-NEXT:    [[VEC1_BC:%.*]] = bitcast <8 x i8> [[VEC1]] to <2 x i32>
-; GFX906-NEXT:    [[GEP2:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
-; GFX906-NEXT:    [[VEC2:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP2]], align 8
-; GFX906-NEXT:    [[VEC2_BC:%.*]] = bitcast <8 x i8> [[VEC2]] to <2 x i32>
-; GFX906-NEXT:    [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
-; GFX906-NEXT:    br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]]
-; GFX906:       bb.1:
-; GFX906-NEXT:    br label [[BB_2]]
-; GFX906:       bb.2:
-; GFX906-NEXT:    [[TMP5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
-; GFX906-NEXT:    [[TMP5_TC_BC:%.*]] = bitcast <2 x i32> [[TMP5_TC]] to <8 x i8>
-; GFX906-NEXT:    store <8 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST]], align 4
-; GFX906-NEXT:    ret void
-;
-entry:
-  %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
-  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
-  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
-  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
-  %cmp = icmp ult i32 %idx, 15
-  br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
-  br label %bb.2
-
-bb.2:
-  %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
-  store <8 x i8> %tmp5, ptr addrspace(1) %dst, align 4
-  ret void
-}
-
-define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: define amdgpu_kernel void @repeat_successor(
-; GFX906-SAME: i32 [[IN:%.*]], ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0]] {
-; GFX906-NEXT:  entry:
-; GFX906-NEXT:    [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX906-NEXT:    [[GEP1:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
-; GFX906-NEXT:    [[VEC1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 4
-; GFX906-NEXT:    [[VEC1_BC:%.*]] = bitcast <4 x i8> [[VEC1]] to i32
-; GFX906-NEXT:    [[GEP2:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
-; GFX906-NEXT:    [[VEC2:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP2]], align 4
-; GFX906-NEXT:    [[VEC2_BC:%.*]] = bitcast <4 x i8> [[VEC2]] to i32
-; GFX906-NEXT:    switch i32 [[IN]], label [[RETURN:%.*]] [
-; GFX906-NEXT:      i32 1, label [[RETURN_SINK_SPLIT:%.*]]
-; GFX906-NEXT:      i32 2, label [[RETURN_SINK_SPLIT]]
-; GFX906-NEXT:      i32 3, label [[SW_BB5:%.*]]
-; GFX906-NEXT:    ]
-; GFX906:       sw.bb5:
-; GFX906-NEXT:    br label [[RETURN_SINK_SPLIT]]
-; GFX906:       return.sink.split:
-; GFX906-NEXT:    [[TMP5_TC:%.*]] = phi i32 [ [[VEC2_BC]], [[SW_BB5]] ], [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC1_BC]], [[ENTRY]] ]
-; GFX906-NEXT:    [[TMP5_TC_BC:%.*]] = bitcast i32 [[TMP5_TC]] to <4 x i8>
-; GFX906-NEXT:    store <4 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST]], align 4
-; GFX906-NEXT:    ret void
-; GFX906:       return:
-; GFX906-NEXT:    ret void
-;
-entry:
-  %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
-  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
-  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
-  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
-  switch i32 %in, label %return [
-  i32 1, label %return.sink.split
-  i32 2, label %return.sink.split
-  i32 3, label %sw.bb5
-  ]
-
-sw.bb5:
-  br label %return.sink.split
-
-return.sink.split:
-  %tmp5 = phi <4 x i8> [ %vec2, %sw.bb5 ], [ %vec1, %entry ], [ %vec1, %entry ]
-  store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4
-  ret void
-
-return:
-  ret void
-}
-
-define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
-; GFX906-LABEL: define amdgpu_kernel void @v8i8_phi_chain(
-; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST0:%.*]], ptr addrspace(1) nocapture [[DST1:%.*]]) #[[ATTR0]] {
-; GFX906-NEXT:  entry:
-; GFX906-NEXT:    [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX906-NEXT:    [[GEP1:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
-; GFX906-NEXT:    [[VEC1:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP1]], align 8
-; GFX906-NEXT:    [[VEC1_BC:%.*]] = bitcast <8 x i8> [[VEC1]] to <2 x i32>
-; GFX906-NEXT:    [[GEP2:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
-; GFX906-NEXT:    [[VEC2:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP2]], align 8
-; GFX906-NEXT:    [[VEC2_BC:%.*]] = bitcast <8 x i8> [[VEC2]] to <2 x i32>
-; GFX906-NEXT:    [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
-; GFX906-NEXT:    br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]]
-; GFX906:       bb.1:
-; GFX906-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[IDX]], 7
-; GFX906-NEXT:    br i1 [[CMP2]], label [[BB_2]], label [[BB_3:%.*]]
-; GFX906:       bb.2:
-; GFX906-NEXT:    [[TMP5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
-; GFX906-NEXT:    [[TMP5_TC_BC:%.*]] = bitcast <2 x i32> [[TMP5_TC]] to <8 x i8>
-; GFX906-NEXT:    store <8 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST0]], align 4
-; GFX906-NEXT:    br label [[BB_3]]
-; GFX906:       bb.3:
-; GFX906-NEXT:    [[TMP7_TC:%.*]] = phi <2 x i32> [ [[VEC2_BC]], [[BB_1]] ], [ [[TMP5_TC]], [[BB_2]] ]
-; GFX906-NEXT:    [[TMP7_TC_BC:%.*]] = bitcast <2 x i32> [[TMP7_TC]] to <8 x i8>
-; GFX906-NEXT:    store <8 x i8> [[TMP7_TC_BC]], ptr addrspace(1) [[DST1]], align 4
-; GFX906-NEXT:    ret void
-;
-entry:
-  %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
-  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
-  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
-  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
-  %cmp = icmp ult i32 %idx, 15
-  br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
-  %cmp2 = icmp ult i32 %idx, 7
-  br i1 %cmp2, label %bb.2, label %bb.3
-
-bb.2:
-  %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
-  store <8 x i8> %tmp5, ptr addrspace(1) %dst0, align 4
-  br label %bb.3
-
-bb.3:
-  %tmp7 = phi <8 x i8> [ %vec2, %bb.1], [%tmp5, %bb.2]
-  store <8 x i8> %tmp7, ptr addrspace(1) %dst1, align 4
-  ret void
-}
-
-define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
-; GFX906-LABEL: define amdgpu_kernel void @v8i8_multi_block(
-; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST0:%.*]], ptr addrspace(1) nocapture [[DST1:%.*]]) #[[ATTR0]] {
-; GFX906-NEXT:  entry:
-; GFX906-NEXT:    [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX906-NEXT:    [[GEP1:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
-; GFX906-NEXT:    [[VEC1:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP1]], align 8
-; GFX906-NEXT:    [[VEC1_BC:%.*]] = bitcast <8 x i8> [[VEC1]] to <2 x i32>
-; GFX906-NEXT:    [[GEP2:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
-; GFX906-NEXT:    [[VEC2:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP2]], align 8
-; GFX906-NEXT:    [[VEC2_BC:%.*]] = bitcast <8 x i8> [[VEC2]] to <2 x i32>
-; GFX906-NEXT:    [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
-; GFX906-NEXT:    br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_3:%.*]]
-; GFX906:       bb.1:
-; GFX906-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[IDX]], 7
-; GFX906-NEXT:    br i1 [[CMP2]], label [[BB_2:%.*]], label [[BB_3]]
-; GFX906:       bb.2:
-; GFX906-NEXT:    [[VEC1_BC_BC:%.*]] = bitcast <2 x i32> [[VEC1_BC]] to <8 x i8>
-; GFX906-NEXT:    store <8 x i8> [[VEC1_BC_BC]], ptr addrspace(1) [[DST0]], align 4
-; GFX906-NEXT:    br label [[BB_3]]
-; GFX906:       bb.3:
-; GFX906-NEXT:    [[TMP5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ], [ [[VEC2_BC]], [[BB_2]] ]
-; GFX906-NEXT:    [[TMP5_TC_BC:%.*]] = bitcast <2 x i32> [[TMP5_TC]] to <8 x i8>
-; GFX906-NEXT:    store <8 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST1]], align 4
-; GFX906-NEXT:    ret void
-;
-entry:
-  %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
-  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
-  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
-  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
-  %cmp = icmp ult i32 %idx, 15
-  br i1 %cmp, label %bb.1, label %bb.3
-bb.1:
-  %cmp2 = icmp ult i32 %idx, 7
-  br i1 %cmp2, label %bb.2, label %bb.3
-
-bb.2:
-  store <8 x i8> %vec1, ptr addrspace(1) %dst0, align 4
-  br label %bb.3
-
-bb.3:
-  %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ], [ %vec2, %bb.2]
-  store <8 x i8> %tmp5, ptr addrspace(1) %dst1, align 4
-  ret void
-}
-
-define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: define amdgpu_kernel void @v32i8_loop_carried(
-; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0]] {
-; GFX906-NEXT:  entry:
-; GFX906-NEXT:    [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX906-NEXT:    [[GEP1:%.*]] = getelementptr <32 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
-; GFX906-NEXT:    [[VEC1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 4
-; GFX906-NEXT:    [[VEC1_BC:%.*]] = bitcast <4 x i8> [[VEC1]] to i32
-; GFX906-NEXT:    br label [[BB_1:%.*]]
-; GFX906:       bb.1:
-; GFX906-NEXT:    [[TEMP_TC:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC:%.*]], [[BB_1]] ]
-; GFX906-NEXT:    [[TEMP_TC_BC:%.*]] = bitcast i32 [[TEMP_TC]] to <4 x i8>
-; GFX906-NEXT:    [[VEC1_BC_BC:%.*]] = bitcast i32 [[VEC1_BC]] to <4 x i8>
-; GFX906-NEXT:    [[VEC2:%.*]] = shufflevector <4 x i8> [[VEC1_BC_BC]], <4 x i8> [[TEMP_TC_BC]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; GFX906-NEXT:    [[VEC2_BC]] = bitcast <4 x i8> [[VEC2]] to i32
-; GFX906-NEXT:    [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
-; GFX906-NEXT:    br i1 [[CMP]], label [[BB_1]], label [[BB_2:%.*]]
-; GFX906:       0:
-; GFX906-NEXT:    br label [[BB_2]]
-; GFX906:       bb.2:
-; GFX906-NEXT:    [[VEC2_BC_BC:%.*]] = bitcast i32 [[VEC2_BC]] to <4 x i8>
-; GFX906-NEXT:    store <4 x i8> [[VEC2_BC_BC]], ptr addrspace(1) [[DST]], align 4
-; GFX906-NEXT:    ret void
-;
-entry:
-  %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx
-  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
-  br label %bb.1
-
-bb.1:
-  %temp = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
-  %vec2 = shufflevector <4 x i8> %vec1, <4 x i8> %temp, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-  %cmp = icmp ult i32 %idx, 15
-  br i1 %cmp, label %bb.1, label %bb.2
-  br label %bb.2
-
-bb.2:
-  store <4 x i8> %vec2, ptr addrspace(1) %dst, align 4
-  ret void
-}
-
-declare i32 @llvm.amdgcn.workitem.id.x()


        


More information about the llvm-commits mailing list