[llvm] [AMDGPU] Add IR LiveReg type-based optimization (PR #66838)

Jeffrey Byrnes via llvm-commits llvm-commits at lists.llvm.org
Tue Apr 30 16:52:24 PDT 2024


https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/66838

>From 5efb9556bc49b97e72789065cfed9013388c6f50 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 14 Sep 2023 12:20:06 -0700
Subject: [PATCH 01/11] [AMDGPU] Add IR LiveReg type-based optimization

Change-Id: Ide8a46cdaf1d2d82cbd5296c998a5c8fd41fce80
---
 .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp    |  343 +++
 .../amdgpu-codegenprepare-break-large-phis.ll |  125 +-
 .../test/CodeGen/AMDGPU/vni8-across-blocks.ll | 1880 ++---------------
 3 files changed, 546 insertions(+), 1802 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 6e7d34f5adaa3f..8f1dd1c522b04c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -107,6 +107,7 @@ class AMDGPUCodeGenPrepareImpl
   Module *Mod = nullptr;
   const DataLayout *DL = nullptr;
   bool HasUnsafeFPMath = false;
+  bool UsesGlobalISel = false;
   bool HasFP32DenormalFlush = false;
   bool FlowChanged = false;
   mutable Function *SqrtF32 = nullptr;
@@ -343,6 +344,85 @@ class AMDGPUCodeGenPrepare : public FunctionPass {
   StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
 };
 
+class LiveRegConversion {
+private:
+  // The instruction which defined the original virtual register used across
+  // blocks
+  Instruction *LiveRegDef;
+  // The original type
+  Type *OriginalType;
+  // The desired type
+  Type *NewType;
+  // The instruction sequence that converts the virtual register, to be used
+  // instead of the original
+  std::optional<Instruction *> Converted;
+  // The builder used to build the conversion instruction
+  IRBuilder<> ConvertBuilder;
+
+public:
+  // The instruction which defined the original virtual register used across
+  // blocks
+  Instruction *getLiveRegDef() { return LiveRegDef; }
+  // The original type
+  Type *getOriginalType() { return OriginalType; }
+  // The desired type
+  Type *getNewType() { return NewType; }
+  void setNewType(Type *NewType) { this->NewType = NewType; }
+  // The instruction that converts the virtual register, to be used instead of
+  // the original
+  std::optional<Instruction *> &getConverted() { return Converted; }
+  void setConverted(Instruction *Converted) { this->Converted = Converted; }
+  // The builder used to build the conversion instruction
+  IRBuilder<> &getConverBuilder() { return ConvertBuilder; }
+  // Do we have an instruction sequence which converts the original virtual
+  // register
+  bool hasConverted() { return Converted.has_value(); }
+
+  LiveRegConversion(Instruction *LiveRegDef, BasicBlock *InsertBlock,
+                    BasicBlock::iterator InsertPt)
+      : LiveRegDef(LiveRegDef), OriginalType(LiveRegDef->getType()),
+        ConvertBuilder(InsertBlock, InsertPt) {}
+  LiveRegConversion(Instruction *LiveRegDef, Type *NewType,
+                    BasicBlock *InsertBlock, BasicBlock::iterator InsertPt)
+      : LiveRegDef(LiveRegDef), OriginalType(LiveRegDef->getType()),
+        NewType(NewType), ConvertBuilder(InsertBlock, InsertPt) {}
+};
+
+class LiveRegOptimizer {
+private:
+  Module *Mod = nullptr;
+  // The scalar type to convert to
+  Type *ConvertToScalar;
+  // Holds the collection of PHIs with their pending new operands
+  SmallVector<std::pair<Instruction *,
+                        SmallVector<std::pair<Instruction *, BasicBlock *>, 4>>,
+              4>
+      PHIUpdater;
+
+public:
+  // Should the def of the instruction be converted if it is live across blocks
+  bool shouldReplaceUses(const Instruction &I);
+  // Convert the virtual register to the compatible vector of legal type
+  void convertToOptType(LiveRegConversion &LR);
+  // Convert the virtual register back to the original type, stripping away
+  // the MSBs in cases where there was an imperfect fit (e.g. v2i32 -> v7i8)
+  void convertFromOptType(LiveRegConversion &LR);
+  // Get a vector of desired scalar type that is compatible with the original
+  // vector. In cases where there is no bitsize equivalent using a legal vector
+  // type, we pad the MSBs (e.g. v7i8 -> v2i32)
+  Type *getCompatibleType(Instruction *InstToConvert);
+  // Find and replace uses of the virtual register in different block with a
+  // newly produced virtual register of legal type
+  bool replaceUses(Instruction &I);
+  // Replace the collected PHIs with newly produced incoming values. Replacement
+  // is only done if we have a replacement for each original incoming value.
+  bool replacePHIs();
+
+  LiveRegOptimizer(Module *Mod) : Mod(Mod) {
+    ConvertToScalar = Type::getInt32Ty(Mod->getContext());
+  }
+};
+
 } // end anonymous namespace
 
 bool AMDGPUCodeGenPrepareImpl::run(Function &F) {
@@ -360,6 +440,7 @@ bool AMDGPUCodeGenPrepareImpl::run(Function &F) {
       Next = std::next(I);
 
       MadeChange |= visit(*I);
+      I->getType();
 
       if (Next != E) { // Control flow changed
         BasicBlock *NextInstBB = Next->getParent();
@@ -371,9 +452,269 @@ bool AMDGPUCodeGenPrepareImpl::run(Function &F) {
       }
     }
   }
+
+  // GlobalISel should directly use the values, and does not need to emit
+  // CopyTo/CopyFrom Regs across blocks
+  if (UsesGlobalISel)
+    return MadeChange;
+
+  // "Optimize" the virtual regs that cross basic block boundaries. In such
+  // cases, vectors of illegal types will be scalarized and widened, with each
+  // scalar living in its own physical register. The optimization converts the
+  // vectors to equivalent vectors of legal type (which are converted back
+  // before uses in subsequent blocks), to pack the bits into fewer physical
+  // registers (used in CopyToReg/CopyFromReg pairs).
+  LiveRegOptimizer LRO(Mod);
+  for (auto &BB : F) {
+    for (auto &I : BB) {
+      if (!LRO.shouldReplaceUses(I))
+        continue;
+      MadeChange |= LRO.replaceUses(I);
+    }
+  }
+
+  MadeChange |= LRO.replacePHIs();
+  return MadeChange;
+}
+
+bool LiveRegOptimizer::replaceUses(Instruction &I) {
+  bool MadeChange = false;
+
+  struct ConvertUseInfo {
+    Instruction *Converted;
+    SmallVector<Instruction *, 4> Users;
+  };
+  DenseMap<BasicBlock *, ConvertUseInfo> UseConvertTracker;
+
+  LiveRegConversion FromLRC(
+      &I, I.getParent(),
+      static_cast<BasicBlock::iterator>(std::next(I.getIterator())));
+  FromLRC.setNewType(getCompatibleType(FromLRC.getLiveRegDef()));
+  for (auto IUser = I.user_begin(); IUser != I.user_end(); IUser++) {
+
+    if (auto UserInst = dyn_cast<Instruction>(*IUser)) {
+      if (UserInst->getParent() != I.getParent()) {
+        LLVM_DEBUG(dbgs() << *UserInst << "\n\tUses "
+                          << *FromLRC.getOriginalType()
+                          << " from previous block. Needs conversion\n");
+        convertToOptType(FromLRC);
+        if (!FromLRC.hasConverted())
+          continue;
+        // If it is a PHI node, just create and collect the new operand. We can
+        // only replace the PHI node once we have converted all the operands
+        if (auto PhiInst = dyn_cast<PHINode>(UserInst)) {
+          for (unsigned Idx = 0; Idx < PhiInst->getNumIncomingValues(); Idx++) {
+            auto IncVal = PhiInst->getIncomingValue(Idx);
+            if (&I == dyn_cast<Instruction>(IncVal)) {
+              auto IncBlock = PhiInst->getIncomingBlock(Idx);
+              auto PHIOps = find_if(
+                  PHIUpdater,
+                  [&UserInst](
+                      std::pair<Instruction *,
+                                SmallVector<
+                                    std::pair<Instruction *, BasicBlock *>, 4>>
+                          &Entry) { return Entry.first == UserInst; });
+
+              if (PHIOps == PHIUpdater.end())
+                PHIUpdater.push_back(
+                    {UserInst, {{*FromLRC.getConverted(), IncBlock}}});
+              else
+                PHIOps->second.push_back({*FromLRC.getConverted(), IncBlock});
+
+              break;
+            }
+          }
+          continue;
+        }
+
+        // Do not create multiple conversion sequences if there are multiple
+        // uses in the same block
+        if (UseConvertTracker.contains(UserInst->getParent())) {
+          UseConvertTracker[UserInst->getParent()].Users.push_back(UserInst);
+          LLVM_DEBUG(dbgs() << "\tUser already has access to converted def\n");
+          continue;
+        }
+
+        LiveRegConversion ToLRC(*FromLRC.getConverted(), I.getType(),
+                                UserInst->getParent(),
+                                static_cast<BasicBlock::iterator>(
+                                    UserInst->getParent()->getFirstNonPHIIt()));
+        convertFromOptType(ToLRC);
+        assert(ToLRC.hasConverted());
+        UseConvertTracker[UserInst->getParent()] = {*ToLRC.getConverted(),
+                                                    {UserInst}};
+      }
+    }
+  }
+
+  // Replace the uses of I in a separate loop that is not dependent upon the
+  // state of the uses
+  for (auto &Entry : UseConvertTracker) {
+    for (auto &UserInst : Entry.second.Users) {
+      LLVM_DEBUG(dbgs() << *UserInst
+                        << "\n\tNow uses: " << *Entry.second.Converted << "\n");
+      UserInst->replaceUsesOfWith(&I, Entry.second.Converted);
+      MadeChange = true;
+    }
+  }
+  return MadeChange;
+}
+
+bool LiveRegOptimizer::replacePHIs() {
+  bool MadeChange = false;
+  for (auto Ele : PHIUpdater) {
+    auto ThePHINode = dyn_cast<PHINode>(Ele.first);
+    assert(ThePHINode);
+    auto NewPHINodeOps = Ele.second;
+    LLVM_DEBUG(dbgs() << "Attempting to replace: " << *ThePHINode << "\n");
+    // If we have converted all the required operands, then do the replacement
+    if (ThePHINode->getNumIncomingValues() == NewPHINodeOps.size()) {
+      IRBuilder<> Builder(Ele.first);
+      auto NPHI = Builder.CreatePHI(NewPHINodeOps[0].first->getType(),
+                                    NewPHINodeOps.size());
+      for (auto IncVals : NewPHINodeOps) {
+        NPHI->addIncoming(IncVals.first, IncVals.second);
+        LLVM_DEBUG(dbgs() << "  Using: " << *IncVals.first
+                          << "  For: " << IncVals.second->getName() << "\n");
+      }
+      LLVM_DEBUG(dbgs() << "Sucessfully replaced with " << *NPHI << "\n");
+      LiveRegConversion ToLRC(NPHI, ThePHINode->getType(),
+                              ThePHINode->getParent(),
+                              static_cast<BasicBlock::iterator>(
+                                  ThePHINode->getParent()->getFirstNonPHIIt()));
+      convertFromOptType(ToLRC);
+      assert(ToLRC.hasConverted());
+      Ele.first->replaceAllUsesWith(*ToLRC.getConverted());
+      // The old PHI is no longer used
+      ThePHINode->eraseFromParent();
+      MadeChange = true;
+    }
+  }
   return MadeChange;
 }
 
+Type *LiveRegOptimizer::getCompatibleType(Instruction *InstToConvert) {
+  auto OriginalType = InstToConvert->getType();
+  assert(OriginalType->getScalarSizeInBits() <=
+         ConvertToScalar->getScalarSizeInBits());
+  auto VTy = dyn_cast<VectorType>(OriginalType);
+  if (!VTy)
+    return ConvertToScalar;
+
+  auto OriginalSize =
+      VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue();
+  auto ConvertScalarSize = ConvertToScalar->getScalarSizeInBits();
+  auto ConvertEltCount =
+      (OriginalSize + ConvertScalarSize - 1) / ConvertScalarSize;
+
+  return VectorType::get(Type::getIntNTy(Mod->getContext(), ConvertScalarSize),
+                         llvm::ElementCount::getFixed(ConvertEltCount));
+}
+
+void LiveRegOptimizer::convertToOptType(LiveRegConversion &LR) {
+  if (LR.hasConverted()) {
+    LLVM_DEBUG(dbgs() << "\tAlready has converted def\n");
+    return;
+  }
+
+  auto VTy = dyn_cast<VectorType>(LR.getOriginalType());
+  assert(VTy);
+  auto NewVTy = dyn_cast<VectorType>(LR.getNewType());
+  assert(NewVTy);
+
+  auto V = static_cast<Value *>(LR.getLiveRegDef());
+  auto OriginalSize =
+      VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue();
+  auto NewSize =
+      NewVTy->getScalarSizeInBits() * NewVTy->getElementCount().getFixedValue();
+
+  auto &Builder = LR.getConverBuilder();
+
+  // If there is a bitsize match, we can fit the old vector into a new vector of
+  // desired type
+  if (OriginalSize == NewSize) {
+    LR.setConverted(dyn_cast<Instruction>(Builder.CreateBitCast(V, NewVTy)));
+    LLVM_DEBUG(dbgs() << "\tConverted def to "
+                      << *(*LR.getConverted())->getType() << "\n");
+    return;
+  }
+
+  // If there is a bitsize mismatch, we must use a wider vector
+  assert(NewSize > OriginalSize);
+  auto ExpandedVecElementCount =
+      llvm::ElementCount::getFixed(NewSize / VTy->getScalarSizeInBits());
+
+  SmallVector<int, 8> ShuffleMask;
+  for (unsigned I = 0; I < VTy->getElementCount().getFixedValue(); I++)
+    ShuffleMask.push_back(I);
+
+  for (uint64_t I = VTy->getElementCount().getFixedValue();
+       I < ExpandedVecElementCount.getFixedValue(); I++)
+    ShuffleMask.push_back(VTy->getElementCount().getFixedValue());
+
+  auto ExpandedVec =
+      dyn_cast<Instruction>(Builder.CreateShuffleVector(V, ShuffleMask));
+  LR.setConverted(
+      dyn_cast<Instruction>(Builder.CreateBitCast(ExpandedVec, NewVTy)));
+  LLVM_DEBUG(dbgs() << "\tConverted def to " << *(*LR.getConverted())->getType()
+                    << "\n");
+  return;
+}
+
+void LiveRegOptimizer::convertFromOptType(LiveRegConversion &LRC) {
+  auto VTy = dyn_cast<VectorType>(LRC.getOriginalType());
+  assert(VTy);
+  auto NewVTy = dyn_cast<VectorType>(LRC.getNewType());
+  assert(NewVTy);
+
+  auto V = static_cast<Value *>(LRC.getLiveRegDef());
+  auto OriginalSize =
+      VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue();
+  auto NewSize =
+      NewVTy->getScalarSizeInBits() * NewVTy->getElementCount().getFixedValue();
+
+  auto &Builder = LRC.getConverBuilder();
+
+  // If there is a bitsize match, we simply convert back to the original type
+  if (OriginalSize == NewSize) {
+    LRC.setConverted(dyn_cast<Instruction>(Builder.CreateBitCast(V, NewVTy)));
+    LLVM_DEBUG(dbgs() << "\tProduced for user: " << **LRC.getConverted()
+                      << "\n");
+    return;
+  }
+
+  // If there is a bitsize mismatch, we have used a wider vector and must strip
+  // the MSBs to convert back to the original type
+  assert(OriginalSize > NewSize);
+  auto ExpandedVecElementCount = llvm::ElementCount::getFixed(
+      OriginalSize / NewVTy->getScalarSizeInBits());
+  auto ExpandedVT = VectorType::get(
+      Type::getIntNTy(Mod->getContext(), NewVTy->getScalarSizeInBits()),
+      ExpandedVecElementCount);
+  auto Converted = dyn_cast<Instruction>(
+      Builder.CreateBitCast(LRC.getLiveRegDef(), ExpandedVT));
+
+  auto NarrowElementCount = NewVTy->getElementCount().getFixedValue();
+  SmallVector<int, 8> ShuffleMask;
+  for (uint64_t I = 0; I < NarrowElementCount; I++)
+    ShuffleMask.push_back(I);
+
+  auto NarrowVec = dyn_cast<Instruction>(
+      Builder.CreateShuffleVector(Converted, ShuffleMask));
+  LRC.setConverted(dyn_cast<Instruction>(NarrowVec));
+  LLVM_DEBUG(dbgs() << "\tProduced for user: " << **LRC.getConverted() << "\n");
+  return;
+}
+
+bool LiveRegOptimizer::shouldReplaceUses(const Instruction &I) {
+  // Vectors of illegal types are copied across blocks in an inefficient
+  // manner. They are scalarized and widened to legal scalars. In such cases,
+  // we can do better by using legal vector types
+  auto IType = I.getType();
+  return IType->isVectorTy() && IType->getScalarSizeInBits() < 16 &&
+         !I.getType()->getScalarType()->isPointerTy();
+}
+
 unsigned AMDGPUCodeGenPrepareImpl::getBaseElementBitWidth(const Type *T) const {
   assert(needsPromotionToI32(T) && "T does not need promotion to i32");
 
@@ -2275,6 +2616,7 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
   Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
   Impl.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
   Impl.UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
+  Impl.UsesGlobalISel = TM.Options.EnableGlobalISel;
   auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
   Impl.DT = DTWP ? &DTWP->getDomTree() : nullptr;
   Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
@@ -2297,6 +2639,7 @@ PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
   Impl.DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
   Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
   SIModeRegisterDefaults Mode(F, *Impl.ST);
+  Impl.UsesGlobalISel = TM.Options.EnableGlobalISel;
   Impl.HasFP32DenormalFlush =
       Mode.FP32Denormals == DenormalMode::getPreserveSign();
   PreservedAnalyses PA = PreservedAnalyses::none();
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll
index 93b9aeac3cd3f3..f4871fa1314426 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll
@@ -495,10 +495,15 @@ define amdgpu_kernel void @phi_v23i8(<23 x i8> %in, ptr %out, i1 %cond) {
 ; OPT:       then:
 ; OPT-NEXT:    [[X:%.*]] = insertelement <23 x i8> [[IN:%.*]], i8 42, i32 3
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE0:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; OPT-NEXT:    [[TMP0:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE0]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE2:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; OPT-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE2]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE4:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; OPT-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE4]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE6:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; OPT-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE6]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE8:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; OPT-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE8]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE10:%.*]] = extractelement <23 x i8> [[X]], i64 20
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE12:%.*]] = extractelement <23 x i8> [[X]], i64 21
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE14:%.*]] = extractelement <23 x i8> [[X]], i64 22
@@ -506,31 +511,41 @@ define amdgpu_kernel void @phi_v23i8(<23 x i8> %in, ptr %out, i1 %cond) {
 ; OPT:       else:
 ; OPT-NEXT:    [[Y:%.*]] = insertelement <23 x i8> [[IN]], i8 64, i32 6
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE1:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; OPT-NEXT:    [[TMP5:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE1]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE3:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; OPT-NEXT:    [[TMP6:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE3]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE5:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; OPT-NEXT:    [[TMP7:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE5]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE7:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; OPT-NEXT:    [[TMP8:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE7]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE9:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; OPT-NEXT:    [[TMP9:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE9]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE11:%.*]] = extractelement <23 x i8> [[Y]], i64 20
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE13:%.*]] = extractelement <23 x i8> [[Y]], i64 21
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE15:%.*]] = extractelement <23 x i8> [[Y]], i64 22
 ; OPT-NEXT:    br label [[FINALLY]]
 ; OPT:       finally:
-; OPT-NEXT:    [[TMP0:%.*]] = phi <4 x i8> [ [[LARGEPHI_EXTRACTSLICE0]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP1:%.*]] = phi <4 x i8> [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP2:%.*]] = phi <4 x i8> [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP3:%.*]] = phi <4 x i8> [ [[LARGEPHI_EXTRACTSLICE6]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP4:%.*]] = phi <4 x i8> [ [[LARGEPHI_EXTRACTSLICE8]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP5:%.*]] = phi i8 [ [[LARGEPHI_EXTRACTSLICE10]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP6:%.*]] = phi i8 [ [[LARGEPHI_EXTRACTSLICE12]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE13]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP7:%.*]] = phi i8 [ [[LARGEPHI_EXTRACTSLICE14]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE15]], [[ELSE]] ]
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE0:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> poison, <4 x i8> [[TMP0]], i64 0)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE1:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE0]], <4 x i8> [[TMP1]], i64 4)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE2:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE1]], <4 x i8> [[TMP2]], i64 8)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE3:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE2]], <4 x i8> [[TMP3]], i64 12)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE4:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE3]], <4 x i8> [[TMP4]], i64 16)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE5:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE4]], i8 [[TMP5]], i64 20
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE6:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE5]], i8 [[TMP6]], i64 21
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE7:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE6]], i8 [[TMP7]], i64 22
+; OPT-NEXT:    [[TMP10:%.*]] = phi <1 x i32> [ [[TMP0]], [[THEN]] ], [ [[TMP5]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP11:%.*]] = phi <1 x i32> [ [[TMP1]], [[THEN]] ], [ [[TMP6]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP12:%.*]] = phi <1 x i32> [ [[TMP2]], [[THEN]] ], [ [[TMP7]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP13:%.*]] = phi <1 x i32> [ [[TMP3]], [[THEN]] ], [ [[TMP8]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP14:%.*]] = phi <1 x i32> [ [[TMP4]], [[THEN]] ], [ [[TMP9]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP15:%.*]] = phi i8 [ [[LARGEPHI_EXTRACTSLICE10]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP16:%.*]] = phi i8 [ [[LARGEPHI_EXTRACTSLICE12]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE13]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP17:%.*]] = phi i8 [ [[LARGEPHI_EXTRACTSLICE14]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE15]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP18:%.*]] = bitcast <1 x i32> [[TMP14]] to <4 x i8>
+; OPT-NEXT:    [[TMP19:%.*]] = bitcast <1 x i32> [[TMP13]] to <4 x i8>
+; OPT-NEXT:    [[TMP20:%.*]] = bitcast <1 x i32> [[TMP12]] to <4 x i8>
+; OPT-NEXT:    [[TMP21:%.*]] = bitcast <1 x i32> [[TMP11]] to <4 x i8>
+; OPT-NEXT:    [[TMP22:%.*]] = bitcast <1 x i32> [[TMP10]] to <4 x i8>
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE0:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> poison, <4 x i8> [[TMP22]], i64 0)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE1:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE0]], <4 x i8> [[TMP21]], i64 4)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE2:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE1]], <4 x i8> [[TMP20]], i64 8)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE3:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE2]], <4 x i8> [[TMP19]], i64 12)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE4:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE3]], <4 x i8> [[TMP18]], i64 16)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE5:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE4]], i8 [[TMP15]], i64 20
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE6:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE5]], i8 [[TMP16]], i64 21
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE7:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE6]], i8 [[TMP17]], i64 22
 ; OPT-NEXT:    store <23 x i8> [[LARGEPHI_INSERTSLICE7]], ptr [[OUT:%.*]], align 1
 ; OPT-NEXT:    ret void
 ;
@@ -539,13 +554,19 @@ define amdgpu_kernel void @phi_v23i8(<23 x i8> %in, ptr %out, i1 %cond) {
 ; NOOPT-NEXT:    br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]]
 ; NOOPT:       then:
 ; NOOPT-NEXT:    [[X:%.*]] = insertelement <23 x i8> [[IN:%.*]], i8 42, i32 3
+; NOOPT-NEXT:    [[TMP0:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; NOOPT-NEXT:    [[TMP1:%.*]] = bitcast <24 x i8> [[TMP0]] to <6 x i32>
 ; NOOPT-NEXT:    br label [[FINALLY:%.*]]
 ; NOOPT:       else:
 ; NOOPT-NEXT:    [[Y:%.*]] = insertelement <23 x i8> [[IN]], i8 64, i32 6
+; NOOPT-NEXT:    [[TMP2:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; NOOPT-NEXT:    [[TMP3:%.*]] = bitcast <24 x i8> [[TMP2]] to <6 x i32>
 ; NOOPT-NEXT:    br label [[FINALLY]]
 ; NOOPT:       finally:
-; NOOPT-NEXT:    [[VAL:%.*]] = phi <23 x i8> [ [[X]], [[THEN]] ], [ [[Y]], [[ELSE]] ]
-; NOOPT-NEXT:    store <23 x i8> [[VAL]], ptr [[OUT:%.*]], align 1
+; NOOPT-NEXT:    [[TMP4:%.*]] = phi <6 x i32> [ [[TMP1]], [[THEN]] ], [ [[TMP3]], [[ELSE]] ]
+; NOOPT-NEXT:    [[TMP5:%.*]] = bitcast <6 x i32> [[TMP4]] to <24 x i8>
+; NOOPT-NEXT:    [[TMP6:%.*]] = shufflevector <24 x i8> [[TMP5]], <24 x i8> poison, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22>
+; NOOPT-NEXT:    store <23 x i8> [[TMP6]], ptr [[OUT:%.*]], align 1
 ; NOOPT-NEXT:    ret void
 ;
 entry:
@@ -572,31 +593,36 @@ define amdgpu_kernel void @phi_v23i8_zeroinit(<23 x i8> %in, ptr %out, i1 %cond)
 ; OPT:       else:
 ; OPT-NEXT:    [[Y:%.*]] = insertelement <23 x i8> [[IN:%.*]], i8 64, i32 6
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE1:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; OPT-NEXT:    [[TMP0:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE1]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE3:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; OPT-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE3]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE5:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; OPT-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE5]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE7:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; OPT-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE7]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE9:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; OPT-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE9]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE11:%.*]] = extractelement <23 x i8> [[Y]], i64 20
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE13:%.*]] = extractelement <23 x i8> [[Y]], i64 21
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE15:%.*]] = extractelement <23 x i8> [[Y]], i64 22
 ; OPT-NEXT:    br label [[FINALLY]]
 ; OPT:       finally:
-; OPT-NEXT:    [[TMP0:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP1:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP2:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP3:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP4:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP5:%.*]] = phi i8 [ 0, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP6:%.*]] = phi i8 [ 0, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE13]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP7:%.*]] = phi i8 [ 0, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE15]], [[ELSE]] ]
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE0:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> poison, <4 x i8> [[TMP0]], i64 0)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE1:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE0]], <4 x i8> [[TMP1]], i64 4)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE2:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE1]], <4 x i8> [[TMP2]], i64 8)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE3:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE2]], <4 x i8> [[TMP3]], i64 12)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE4:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE3]], <4 x i8> [[TMP4]], i64 16)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE5:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE4]], i8 [[TMP5]], i64 20
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE6:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE5]], i8 [[TMP6]], i64 21
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE7:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE6]], i8 [[TMP7]], i64 22
+; OPT-NEXT:    [[TMP5:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP6:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP7:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP8:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP9:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP10:%.*]] = phi i8 [ 0, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP11:%.*]] = phi i8 [ 0, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE13]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP12:%.*]] = phi i8 [ 0, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE15]], [[ELSE]] ]
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE0:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> poison, <4 x i8> [[TMP5]], i64 0)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE1:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE0]], <4 x i8> [[TMP6]], i64 4)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE2:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE1]], <4 x i8> [[TMP7]], i64 8)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE3:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE2]], <4 x i8> [[TMP8]], i64 12)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE4:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE3]], <4 x i8> [[TMP9]], i64 16)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE5:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE4]], i8 [[TMP10]], i64 20
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE6:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE5]], i8 [[TMP11]], i64 21
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE7:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE6]], i8 [[TMP12]], i64 22
 ; OPT-NEXT:    store <23 x i8> [[LARGEPHI_INSERTSLICE7]], ptr [[OUT:%.*]], align 1
 ; OPT-NEXT:    ret void
 ;
@@ -607,6 +633,8 @@ define amdgpu_kernel void @phi_v23i8_zeroinit(<23 x i8> %in, ptr %out, i1 %cond)
 ; NOOPT-NEXT:    br label [[FINALLY:%.*]]
 ; NOOPT:       else:
 ; NOOPT-NEXT:    [[Y:%.*]] = insertelement <23 x i8> [[IN:%.*]], i8 64, i32 6
+; NOOPT-NEXT:    [[TMP0:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; NOOPT-NEXT:    [[TMP1:%.*]] = bitcast <24 x i8> [[TMP0]] to <6 x i32>
 ; NOOPT-NEXT:    br label [[FINALLY]]
 ; NOOPT:       finally:
 ; NOOPT-NEXT:    [[VAL:%.*]] = phi <23 x i8> [ zeroinitializer, [[THEN]] ], [ [[Y]], [[ELSE]] ]
@@ -635,25 +663,28 @@ define amdgpu_kernel void @phi_v15i8_random_constant_init(<15 x i8> %in, ptr %ou
 ; OPT:       else:
 ; OPT-NEXT:    [[Y:%.*]] = insertelement <15 x i8> [[IN:%.*]], i8 64, i32 6
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE1:%.*]] = shufflevector <15 x i8> [[Y]], <15 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; OPT-NEXT:    [[TMP0:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE1]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE3:%.*]] = shufflevector <15 x i8> [[Y]], <15 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; OPT-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE3]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE5:%.*]] = shufflevector <15 x i8> [[Y]], <15 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; OPT-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE5]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE7:%.*]] = extractelement <15 x i8> [[Y]], i64 12
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE9:%.*]] = extractelement <15 x i8> [[Y]], i64 13
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE11:%.*]] = extractelement <15 x i8> [[Y]], i64 14
 ; OPT-NEXT:    br label [[FINALLY]]
 ; OPT:       finally:
-; OPT-NEXT:    [[TMP0:%.*]] = phi <4 x i8> [ <i8 poison, i8 1, i8 2, i8 3>, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP1:%.*]] = phi <4 x i8> [ <i8 4, i8 undef, i8 6, i8 7>, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP2:%.*]] = phi <4 x i8> [ <i8 9, i8 10, i8 11, i8 12>, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP3:%.*]] = phi i8 [ 13, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP4:%.*]] = phi i8 [ 14, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP5:%.*]] = phi i8 [ undef, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ]
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE0:%.*]] = call <15 x i8> @llvm.vector.insert.v15i8.v4i8(<15 x i8> poison, <4 x i8> [[TMP0]], i64 0)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE1:%.*]] = call <15 x i8> @llvm.vector.insert.v15i8.v4i8(<15 x i8> [[LARGEPHI_INSERTSLICE0]], <4 x i8> [[TMP1]], i64 4)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE2:%.*]] = call <15 x i8> @llvm.vector.insert.v15i8.v4i8(<15 x i8> [[LARGEPHI_INSERTSLICE1]], <4 x i8> [[TMP2]], i64 8)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE3:%.*]] = insertelement <15 x i8> [[LARGEPHI_INSERTSLICE2]], i8 [[TMP3]], i64 12
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE4:%.*]] = insertelement <15 x i8> [[LARGEPHI_INSERTSLICE3]], i8 [[TMP4]], i64 13
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE5:%.*]] = insertelement <15 x i8> [[LARGEPHI_INSERTSLICE4]], i8 [[TMP5]], i64 14
+; OPT-NEXT:    [[TMP3:%.*]] = phi <4 x i8> [ <i8 poison, i8 1, i8 2, i8 3>, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP4:%.*]] = phi <4 x i8> [ <i8 4, i8 undef, i8 6, i8 7>, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP5:%.*]] = phi <4 x i8> [ <i8 9, i8 10, i8 11, i8 12>, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP6:%.*]] = phi i8 [ 13, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP7:%.*]] = phi i8 [ 14, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP8:%.*]] = phi i8 [ undef, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ]
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE0:%.*]] = call <15 x i8> @llvm.vector.insert.v15i8.v4i8(<15 x i8> poison, <4 x i8> [[TMP3]], i64 0)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE1:%.*]] = call <15 x i8> @llvm.vector.insert.v15i8.v4i8(<15 x i8> [[LARGEPHI_INSERTSLICE0]], <4 x i8> [[TMP4]], i64 4)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE2:%.*]] = call <15 x i8> @llvm.vector.insert.v15i8.v4i8(<15 x i8> [[LARGEPHI_INSERTSLICE1]], <4 x i8> [[TMP5]], i64 8)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE3:%.*]] = insertelement <15 x i8> [[LARGEPHI_INSERTSLICE2]], i8 [[TMP6]], i64 12
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE4:%.*]] = insertelement <15 x i8> [[LARGEPHI_INSERTSLICE3]], i8 [[TMP7]], i64 13
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE5:%.*]] = insertelement <15 x i8> [[LARGEPHI_INSERTSLICE4]], i8 [[TMP8]], i64 14
 ; OPT-NEXT:    store <15 x i8> [[LARGEPHI_INSERTSLICE5]], ptr [[OUT:%.*]], align 1
 ; OPT-NEXT:    ret void
 ;
@@ -664,6 +695,8 @@ define amdgpu_kernel void @phi_v15i8_random_constant_init(<15 x i8> %in, ptr %ou
 ; NOOPT-NEXT:    br label [[FINALLY:%.*]]
 ; NOOPT:       else:
 ; NOOPT-NEXT:    [[Y:%.*]] = insertelement <15 x i8> [[IN:%.*]], i8 64, i32 6
+; NOOPT-NEXT:    [[TMP0:%.*]] = shufflevector <15 x i8> [[Y]], <15 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; NOOPT-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
 ; NOOPT-NEXT:    br label [[FINALLY]]
 ; NOOPT:       finally:
 ; NOOPT-NEXT:    [[VAL:%.*]] = phi <15 x i8> [ <i8 poison, i8 1, i8 2, i8 3, i8 4, i8 undef, i8 6, i8 7, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 undef>, [[THEN]] ], [ [[Y]], [[ELSE]] ]
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index f78b408d782557..57179f8f26aec9 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -6,27 +6,31 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 2, v0
+; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX906-NEXT:    v_mov_b32_e32 v3, 8
 ; GFX906-NEXT:    v_mov_b32_e32 v1, 0
-; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dword v2, v5, s[4:5]
+; GFX906-NEXT:    global_load_dword v4, v2, s[4:5]
+; GFX906-NEXT:    s_mov_b32 s4, 0xff0000
+; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 8, v2
+; GFX906-NEXT:    v_lshrrev_b32_sdwa v5, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX906-NEXT:    v_and_or_b32 v4, v4, s4, v5
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB0_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dword v2, v5, s[6:7]
+; GFX906-NEXT:    global_load_dword v0, v2, s[6:7]
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 8, v2
+; GFX906-NEXT:    v_lshrrev_b32_sdwa v2, v3, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX906-NEXT:    v_and_or_b32 v4, v0, s4, v2
 ; GFX906-NEXT:  .LBB0_2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v4
-; GFX906-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_byte v1, v3, s[2:3] offset:2
-; GFX906-NEXT:    global_store_short v1, v0, s[2:3]
+; GFX906-NEXT:    global_store_byte_d16_hi v1, v4, s[2:3] offset:2
+; GFX906-NEXT:    global_store_short v1, v4, s[2:3]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -50,31 +54,19 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 2, v0
+; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
 ; GFX906-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dword v2, v6, s[4:5]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
+; GFX906-NEXT:    global_load_dword v2, v3, s[4:5]
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dword v2, v6, s[6:7]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
+; GFX906-NEXT:    global_load_dword v2, v3, s[6:7]
 ; GFX906-NEXT:  .LBB1_2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v5
-; GFX906-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    global_store_dword v1, v2, s[2:3]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -98,32 +90,23 @@ define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 3, v0
-; GFX906-NEXT:    v_mov_b32_e32 v5, 0
+; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; GFX906-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v7, s[4:5]
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v4, s[4:5]
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b64 v[3:4], 24, v[1:2]
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
+; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v7, s[6:7]
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v4, s[6:7]
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b64 v[3:4], 24, v[1:2]
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
+; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX906-NEXT:  .LBB2_2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v6
-; GFX906-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_byte v5, v2, s[2:3] offset:4
-; GFX906-NEXT:    global_store_dword v5, v0, s[2:3]
+; GFX906-NEXT:    global_store_byte v3, v2, s[2:3] offset:4
+; GFX906-NEXT:    global_store_dword v3, v1, s[2:3]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -147,42 +130,19 @@ define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 3, v0
+; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX906-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v10, s[4:5]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v4, s[4:5]
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v10, s[6:7]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v4, s[6:7]
 ; GFX906-NEXT:  .LBB3_2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v9
-; GFX906-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v7
-; GFX906-NEXT:    v_or_b32_sdwa v1, v8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v6
-; GFX906-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v4
-; GFX906-NEXT:    v_or_b32_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx2 v3, v[0:1], s[2:3]
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    global_store_dwordx2 v3, v[1:2], s[2:3]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -206,64 +166,19 @@ define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v18, 4, v0
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 4, v0
 ; GFX906-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx4 v[1:4], v18, s[4:5]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 24, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 8, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 8, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 8, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 24, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v1
+; GFX906-NEXT:    global_load_dwordx4 v[1:4], v6, s[4:5]
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx4 v[1:4], v18, s[6:7]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 24, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 8, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 8, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 8, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 24, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v1
+; GFX906-NEXT:    global_load_dwordx4 v[1:4], v6, s[6:7]
 ; GFX906-NEXT:  .LBB4_2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v17
-; GFX906-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v15
-; GFX906-NEXT:    v_or_b32_sdwa v1, v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v14
-; GFX906-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v12
-; GFX906-NEXT:    v_or_b32_sdwa v2, v13, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v11
-; GFX906-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v9
-; GFX906-NEXT:    v_or_b32_sdwa v3, v10, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v8
-; GFX906-NEXT:    v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v4, 8, v6
-; GFX906-NEXT:    v_or_b32_sdwa v4, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v5, v[0:3], s[2:3]
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    global_store_dwordx4 v5, v[1:4], s[2:3]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -286,114 +201,24 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
 ; GFX906-LABEL: v32i8_liveout:
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX906-NEXT:    v_lshlrev_b32_e32 v31, 5, v0
-; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 5, v0
 ; GFX906-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx4 v[1:4], v31, s[4:5] offset:16
-; GFX906-NEXT:    global_load_dwordx4 v[5:8], v31, s[4:5]
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 16, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 8, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 24, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 8, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v18, 24, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v19, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v20, 8, v1
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v21, 24, v8
-; GFX906-NEXT:    v_lshrrev_b32_e32 v22, 16, v8
-; GFX906-NEXT:    v_lshrrev_b32_e32 v23, 8, v8
-; GFX906-NEXT:    v_lshrrev_b32_e32 v24, 24, v7
-; GFX906-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
-; GFX906-NEXT:    v_lshrrev_b32_e32 v26, 8, v7
-; GFX906-NEXT:    v_lshrrev_b32_e32 v27, 24, v6
-; GFX906-NEXT:    v_lshrrev_b32_e32 v28, 16, v6
-; GFX906-NEXT:    v_lshrrev_b32_e32 v29, 8, v6
-; GFX906-NEXT:    v_lshrrev_b32_e32 v30, 24, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v32, 16, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v33, 8, v5
-; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX906-NEXT:    global_load_dwordx4 v[1:4], v10, s[4:5] offset:16
+; GFX906-NEXT:    global_load_dwordx4 v[5:8], v10, s[4:5]
+; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx4 v[1:4], v31, s[6:7] offset:16
-; GFX906-NEXT:    global_load_dwordx4 v[5:8], v31, s[6:7]
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 16, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 8, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 24, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 8, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v18, 24, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v19, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v20, 8, v1
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v21, 24, v8
-; GFX906-NEXT:    v_lshrrev_b32_e32 v22, 16, v8
-; GFX906-NEXT:    v_lshrrev_b32_e32 v23, 8, v8
-; GFX906-NEXT:    v_lshrrev_b32_e32 v24, 24, v7
-; GFX906-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
-; GFX906-NEXT:    v_lshrrev_b32_e32 v26, 8, v7
-; GFX906-NEXT:    v_lshrrev_b32_e32 v27, 24, v6
-; GFX906-NEXT:    v_lshrrev_b32_e32 v28, 16, v6
-; GFX906-NEXT:    v_lshrrev_b32_e32 v29, 8, v6
-; GFX906-NEXT:    v_lshrrev_b32_e32 v30, 24, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v32, 16, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v33, 8, v5
+; GFX906-NEXT:    global_load_dwordx4 v[1:4], v10, s[6:7] offset:16
+; GFX906-NEXT:    global_load_dwordx4 v[5:8], v10, s[6:7]
 ; GFX906-NEXT:  .LBB5_2: ; %bb.2
-; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX906-NEXT:    v_lshlrev_b16_e32 v30, 8, v30
-; GFX906-NEXT:    v_lshlrev_b16_e32 v31, 8, v33
-; GFX906-NEXT:    v_lshlrev_b16_e32 v29, 8, v29
-; GFX906-NEXT:    v_lshlrev_b16_e32 v27, 8, v27
-; GFX906-NEXT:    v_lshlrev_b16_e32 v26, 8, v26
-; GFX906-NEXT:    v_lshlrev_b16_e32 v24, 8, v24
-; GFX906-NEXT:    v_lshlrev_b16_e32 v23, 8, v23
-; GFX906-NEXT:    v_lshlrev_b16_e32 v21, 8, v21
-; GFX906-NEXT:    v_or_b32_sdwa v30, v32, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v5, v5, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v6, v6, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v27, v28, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v7, v7, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v24, v25, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v8, v8, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v5, v5, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v6, v6, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v7, v7, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v8, v8, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v9, v[5:8], s[0:1]
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v20
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v18
-; GFX906-NEXT:    v_or_b32_sdwa v5, v19, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v17
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v15
-; GFX906-NEXT:    v_or_b32_sdwa v5, v16, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v14
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v12
-; GFX906-NEXT:    v_or_b32_sdwa v5, v13, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v11
-; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v0, v10, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v9, v[1:4], s[0:1] offset:16
+; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    global_store_dwordx4 v9, v[1:4], s[2:3] offset:16
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    global_store_dwordx4 v9, v[5:8], s[2:3]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -415,1555 +240,98 @@ bb.2:
 define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
 ; GFX906-LABEL: v256i8_liveout:
 ; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX906-NEXT:    v_lshlrev_b32_e32 v61, 3, v0
 ; GFX906-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
 ; GFX906-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
 ; GFX906-NEXT:    s_mov_b32 s10, -1
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    global_load_dwordx4 v[5:8], v61, s[4:5] offset:240
 ; GFX906-NEXT:    s_mov_b32 s11, 0xe00000
 ; GFX906-NEXT:    s_add_u32 s8, s8, s3
-; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v63, 3, v0
 ; GFX906-NEXT:    s_addc_u32 s9, s9, 0
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx4 v[17:20], v63, s[4:5] offset:240
-; GFX906-NEXT:    global_load_dwordx4 v[5:8], v63, s[4:5] offset:224
-; GFX906-NEXT:    global_load_dwordx4 v[9:12], v63, s[4:5] offset:208
-; GFX906-NEXT:    global_load_dwordx4 v[13:16], v63, s[4:5] offset:192
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX906-NEXT:    v_mov_b32_e32 v4, 0
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v20
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v20
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v20
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v19
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v19
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v19
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v18
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v18
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v18
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v17
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v17
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 ; 4-byte Folded Spill
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    buffer_store_dword v18, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v19, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v20, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v17
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v8
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v8
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v8
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v7
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v7
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v7
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v6
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v6
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v5
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v5
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v12
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v12
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v12
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v11
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v11
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v11
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v10
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v10
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v10
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v9
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v9
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v9
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v16
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v16
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v16
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v15
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v15
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v15
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v14
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v14
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v14
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v13
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v13
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v13
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[17:20], v63, s[4:5] offset:176
-; GFX906-NEXT:    global_load_dwordx4 v[21:24], v63, s[4:5] offset:160
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v20
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v20
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v20
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v19
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v19
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v19
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v18
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v18
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v18
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v17
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v17
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v17
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v24
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v24
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v24
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v23
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v23
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v23
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v22
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v22
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v22
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v21
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v21
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v21
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[25:28], v63, s[4:5] offset:144
-; GFX906-NEXT:    global_load_dwordx4 v[29:32], v63, s[4:5] offset:128
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v28
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v28
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v28
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v27
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v27
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v27
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v26
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v26
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v26
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v25
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v25
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v25
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v32
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v32
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v32
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v31
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v31
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v31
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v30
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v30
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v30
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v29
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v29
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v29
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[33:36], v63, s[4:5] offset:112
-; GFX906-NEXT:    global_load_dwordx4 v[37:40], v63, s[4:5] offset:96
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v36
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v36
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v36
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v35
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v35
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v35
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v34
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v34
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v34
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v33
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v33
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v33
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v40
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v40
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v40
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v39
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v39
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v39
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v38
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v38
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v38
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v37
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v37
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v37
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[41:44], v63, s[4:5] offset:80
-; GFX906-NEXT:    global_load_dwordx4 v[45:48], v63, s[4:5] offset:64
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v44
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v44
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v44
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v43
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v43
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v43
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v42
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v42
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v42
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v41
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v41
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v41
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v48
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v48
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v48
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v47
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v47
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v47
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v46
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v46
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v46
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v45
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v45
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v45
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[49:52], v63, s[4:5] offset:48
-; GFX906-NEXT:    global_load_dwordx4 v[53:56], v63, s[4:5] offset:32
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v52
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v52
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v52
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v51
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v51
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v51
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v50
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v50
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v50
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v49
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v49
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v49
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v56
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v56
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v56
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v55
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v55
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v55
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v54
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v54
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v54
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v53
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v53
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v53
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[57:60], v63, s[4:5] offset:16
+; GFX906-NEXT:    buffer_store_dword v5, off, s[8:11], 0 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    buffer_store_dword v6, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v7, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v8, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[5:8], v61, s[4:5] offset:224
 ; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    global_load_dwordx4 v[0:3], v63, s[4:5]
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v60
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v60
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v60
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v59
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v59
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v59
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v58
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v58
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v58
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v57
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v57
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v57
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v3
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v3
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v3
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v2
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v2
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v2
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 24, v0
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v1
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 16, v0
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v1
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 8, v0
-; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX906-NEXT:    global_load_dwordx4 v[9:12], v61, s[4:5] offset:208
+; GFX906-NEXT:    global_load_dwordx4 v[13:16], v61, s[4:5] offset:192
+; GFX906-NEXT:    global_load_dwordx4 v[17:20], v61, s[4:5] offset:176
+; GFX906-NEXT:    global_load_dwordx4 v[21:24], v61, s[4:5] offset:160
+; GFX906-NEXT:    global_load_dwordx4 v[25:28], v61, s[4:5] offset:144
+; GFX906-NEXT:    global_load_dwordx4 v[29:32], v61, s[4:5] offset:128
+; GFX906-NEXT:    global_load_dwordx4 v[33:36], v61, s[4:5] offset:112
+; GFX906-NEXT:    global_load_dwordx4 v[37:40], v61, s[4:5] offset:96
+; GFX906-NEXT:    global_load_dwordx4 v[41:44], v61, s[4:5] offset:80
+; GFX906-NEXT:    global_load_dwordx4 v[45:48], v61, s[4:5] offset:64
+; GFX906-NEXT:    global_load_dwordx4 v[49:52], v61, s[4:5] offset:48
+; GFX906-NEXT:    global_load_dwordx4 v[53:56], v61, s[4:5] offset:32
+; GFX906-NEXT:    global_load_dwordx4 v[57:60], v61, s[4:5] offset:16
+; GFX906-NEXT:    global_load_dwordx4 v[0:3], v61, s[4:5]
+; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx4 v[0:3], v63, s[6:7] offset:240
-; GFX906-NEXT:    global_load_dwordx4 v[5:8], v63, s[6:7] offset:224
-; GFX906-NEXT:    global_load_dwordx4 v[9:12], v63, s[6:7] offset:208
-; GFX906-NEXT:    global_load_dwordx4 v[13:16], v63, s[6:7] offset:192
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 24, v3
-; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 16, v3
-; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v3
-; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 24, v2
-; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 16, v2
-; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v2
-; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 24, v1
-; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 16, v1
-; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v1
-; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 24, v0
-; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
-; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[0:3], v61, s[6:7] offset:240
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 ; 4-byte Folded Spill
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    buffer_store_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
 ; GFX906-NEXT:    buffer_store_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
 ; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v8
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v8
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v8
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v7
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v7
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v7
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v6
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v6
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v5
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v5
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v12
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v12
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v12
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v11
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v11
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v11
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v10
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v10
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v10
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v9
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v9
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v9
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v16
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v16
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v16
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v15
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v15
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v15
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v14
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v14
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v14
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v13
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v13
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v13
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[17:20], v63, s[6:7] offset:176
-; GFX906-NEXT:    global_load_dwordx4 v[21:24], v63, s[6:7] offset:160
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v20
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v20
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v20
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v19
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v19
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v19
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v18
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v18
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v18
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v17
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v17
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v17
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v24
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v24
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v24
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v23
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v23
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v23
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v22
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v22
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v22
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v21
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v21
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v21
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[25:28], v63, s[6:7] offset:144
-; GFX906-NEXT:    global_load_dwordx4 v[29:32], v63, s[6:7] offset:128
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v28
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v28
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v28
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v27
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v27
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v27
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v26
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v26
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v26
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v25
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v25
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v25
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v32
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v32
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v32
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v31
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v31
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v31
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v30
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v30
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v30
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v29
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v29
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v29
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[33:36], v63, s[6:7] offset:112
-; GFX906-NEXT:    global_load_dwordx4 v[37:40], v63, s[6:7] offset:96
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v36
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v36
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v36
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v35
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v35
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v35
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v34
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v34
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v34
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v33
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v33
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v33
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v40
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v40
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v40
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v39
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v39
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v39
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v38
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v38
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v38
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v37
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v37
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v37
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[41:44], v63, s[6:7] offset:80
-; GFX906-NEXT:    global_load_dwordx4 v[45:48], v63, s[6:7] offset:64
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v44
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v44
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v44
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v43
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v43
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v43
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v42
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v42
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v42
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v41
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v41
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v41
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v48
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v48
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v48
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v47
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v47
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v47
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v46
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v46
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v46
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v45
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v45
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v45
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[49:52], v63, s[6:7] offset:48
-; GFX906-NEXT:    global_load_dwordx4 v[53:56], v63, s[6:7] offset:32
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v52
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v52
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v52
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v51
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v51
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v51
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v50
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v50
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v50
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v49
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v49
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v49
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v56
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v56
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v56
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v55
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v55
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v55
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v54
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v54
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v54
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v53
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v53
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v53
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[57:60], v63, s[6:7] offset:16
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    global_load_dwordx4 v[0:3], v63, s[6:7]
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v60
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v60
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v60
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v59
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v59
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v59
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v58
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v58
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v58
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v57
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v57
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v57
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v3
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v3
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v3
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v2
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v2
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v2
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 24, v0
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v1
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 16, v0
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v1
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 8, v0
+; GFX906-NEXT:    global_load_dwordx4 v[5:8], v61, s[6:7] offset:224
+; GFX906-NEXT:    global_load_dwordx4 v[9:12], v61, s[6:7] offset:208
+; GFX906-NEXT:    global_load_dwordx4 v[13:16], v61, s[6:7] offset:192
+; GFX906-NEXT:    global_load_dwordx4 v[17:20], v61, s[6:7] offset:176
+; GFX906-NEXT:    global_load_dwordx4 v[21:24], v61, s[6:7] offset:160
+; GFX906-NEXT:    global_load_dwordx4 v[25:28], v61, s[6:7] offset:144
+; GFX906-NEXT:    global_load_dwordx4 v[29:32], v61, s[6:7] offset:128
+; GFX906-NEXT:    global_load_dwordx4 v[33:36], v61, s[6:7] offset:112
+; GFX906-NEXT:    global_load_dwordx4 v[37:40], v61, s[6:7] offset:96
+; GFX906-NEXT:    global_load_dwordx4 v[41:44], v61, s[6:7] offset:80
+; GFX906-NEXT:    global_load_dwordx4 v[45:48], v61, s[6:7] offset:64
+; GFX906-NEXT:    global_load_dwordx4 v[49:52], v61, s[6:7] offset:48
+; GFX906-NEXT:    global_load_dwordx4 v[53:56], v61, s[6:7] offset:32
+; GFX906-NEXT:    global_load_dwordx4 v[57:60], v61, s[6:7] offset:16
+; GFX906-NEXT:    global_load_dwordx4 v[0:3], v61, s[6:7]
 ; GFX906-NEXT:  .LBB6_2: ; %bb.2
-; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    v_lshlrev_b16_e32 v61, 8, v61
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b16_e32 v62, 8, v62
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v63, off, s[8:11], 0 offset:760 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v61, 8, v61
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v61, 8, v61
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v61, off, s[8:11], 0 offset:768 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v61, 8, v61
-; GFX906-NEXT:    v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v62, off, s[8:11], 0 offset:756 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v62, 8, v62
-; GFX906-NEXT:    v_or_b32_sdwa v62, v63, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v62, off, s[8:11], 0 offset:748 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v61, 8, v61
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v61, off, s[8:11], 0 offset:736 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v62, off, s[8:11], 0 offset:740 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v61, 8, v61
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:732 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:728 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:716 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:704 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    buffer_load_dword v57, off, s[8:11], 0 offset:720 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v58, off, s[8:11], 0 offset:724 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v59, off, s[8:11], 0 offset:712 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v57, 8, v57
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v58, off, s[8:11], 0 offset:708 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v57, off, s[8:11], 0 offset:696 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v58, 8, v58
-; GFX906-NEXT:    v_or_b32_sdwa v58, v59, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v58, off, s[8:11], 0 offset:700 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v57, 8, v57
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v57, off, s[8:11], 0 offset:688 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v58, off, s[8:11], 0 offset:692 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v60, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v57, 8, v57
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:680 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:668 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:656 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    buffer_load_dword v53, off, s[8:11], 0 offset:672 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v54, off, s[8:11], 0 offset:676 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v55, off, s[8:11], 0 offset:664 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v53, 8, v53
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v54, off, s[8:11], 0 offset:660 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v53, off, s[8:11], 0 offset:648 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v54, 8, v54
-; GFX906-NEXT:    v_or_b32_sdwa v54, v55, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v54, off, s[8:11], 0 offset:652 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v53, 8, v53
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v53, off, s[8:11], 0 offset:640 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v54, off, s[8:11], 0 offset:644 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v53, 8, v53
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:32
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:632 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:620 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:608 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    buffer_load_dword v49, off, s[8:11], 0 offset:624 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v50, off, s[8:11], 0 offset:628 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v51, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v51, off, s[8:11], 0 offset:616 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v49, 8, v49
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v50, off, s[8:11], 0 offset:612 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v49, off, s[8:11], 0 offset:600 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v50, 8, v50
-; GFX906-NEXT:    v_or_b32_sdwa v50, v51, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v50, off, s[8:11], 0 offset:604 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v49, 8, v49
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v49, off, s[8:11], 0 offset:592 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v50, off, s[8:11], 0 offset:596 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v49, 8, v49
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:48
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:584 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:572 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:560 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    buffer_load_dword v45, off, s[8:11], 0 offset:576 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v46, off, s[8:11], 0 offset:580 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v47, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v47, off, s[8:11], 0 offset:568 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v45, 8, v45
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v46, off, s[8:11], 0 offset:564 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v45, off, s[8:11], 0 offset:552 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v46, 8, v46
-; GFX906-NEXT:    v_or_b32_sdwa v46, v47, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v46, off, s[8:11], 0 offset:556 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v45, 8, v45
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v45, off, s[8:11], 0 offset:544 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v46, off, s[8:11], 0 offset:548 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v48, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v45, 8, v45
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:64
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:536 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:524 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:512 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    buffer_load_dword v41, off, s[8:11], 0 offset:528 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v42, off, s[8:11], 0 offset:532 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v43, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v43, off, s[8:11], 0 offset:520 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v41, 8, v41
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v42, off, s[8:11], 0 offset:516 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v41, off, s[8:11], 0 offset:504 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v42, 8, v42
-; GFX906-NEXT:    v_or_b32_sdwa v42, v43, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v42, off, s[8:11], 0 offset:508 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v41, 8, v41
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v41, off, s[8:11], 0 offset:496 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v42, off, s[8:11], 0 offset:500 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v44, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v41, 8, v41
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:80
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:488 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:476 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:464 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    buffer_load_dword v37, off, s[8:11], 0 offset:480 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v38, off, s[8:11], 0 offset:484 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v39, off, s[8:11], 0 offset:472 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v37, 8, v37
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v38, off, s[8:11], 0 offset:468 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v37, off, s[8:11], 0 offset:456 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v38, 8, v38
-; GFX906-NEXT:    v_or_b32_sdwa v38, v39, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v38, off, s[8:11], 0 offset:460 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v37, 8, v37
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v37, off, s[8:11], 0 offset:448 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v38, off, s[8:11], 0 offset:452 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v37, 8, v37
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:96
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:440 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:428 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:416 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    buffer_load_dword v33, off, s[8:11], 0 offset:432 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v34, off, s[8:11], 0 offset:436 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v35, off, s[8:11], 0 offset:424 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v33, 8, v33
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v34, off, s[8:11], 0 offset:420 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v33, off, s[8:11], 0 offset:408 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v34, 8, v34
-; GFX906-NEXT:    v_or_b32_sdwa v34, v35, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v34, off, s[8:11], 0 offset:412 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v33, 8, v33
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v33, off, s[8:11], 0 offset:400 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v34, off, s[8:11], 0 offset:404 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v33, 8, v33
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:112
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:392 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:380 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:368 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    buffer_load_dword v29, off, s[8:11], 0 offset:384 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v30, off, s[8:11], 0 offset:388 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v31, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v31, off, s[8:11], 0 offset:376 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v29, 8, v29
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v30, off, s[8:11], 0 offset:372 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v29, off, s[8:11], 0 offset:360 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v30, 8, v30
-; GFX906-NEXT:    v_or_b32_sdwa v30, v31, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v30, off, s[8:11], 0 offset:364 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v29, 8, v29
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v29, off, s[8:11], 0 offset:352 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v30, off, s[8:11], 0 offset:356 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v32, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v29, 8, v29
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:128
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:344 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:332 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:320 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    buffer_load_dword v25, off, s[8:11], 0 offset:336 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v26, off, s[8:11], 0 offset:340 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v27, off, s[8:11], 0 offset:328 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v25, 8, v25
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v26, off, s[8:11], 0 offset:324 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v25, off, s[8:11], 0 offset:312 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v26, 8, v26
-; GFX906-NEXT:    v_or_b32_sdwa v26, v27, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v26, off, s[8:11], 0 offset:316 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v25, 8, v25
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v25, off, s[8:11], 0 offset:304 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v26, off, s[8:11], 0 offset:308 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v28, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v25, 8, v25
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:144
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:296 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:284 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:272 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    buffer_load_dword v21, off, s[8:11], 0 offset:288 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v22, off, s[8:11], 0 offset:292 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v23, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v23, off, s[8:11], 0 offset:280 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v21, 8, v21
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v22, off, s[8:11], 0 offset:276 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v21, off, s[8:11], 0 offset:264 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v22, 8, v22
-; GFX906-NEXT:    v_or_b32_sdwa v22, v23, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v22, off, s[8:11], 0 offset:268 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v21, 8, v21
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v21, off, s[8:11], 0 offset:256 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v22, off, s[8:11], 0 offset:260 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v24, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v21, 8, v21
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:160
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:248 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:236 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:224 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    buffer_load_dword v17, off, s[8:11], 0 offset:240 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v18, off, s[8:11], 0 offset:244 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v19, off, s[8:11], 0 offset:232 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v17, 8, v17
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v18, off, s[8:11], 0 offset:228 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v17, off, s[8:11], 0 offset:216 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v18, 8, v18
-; GFX906-NEXT:    v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v18, off, s[8:11], 0 offset:220 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v17, 8, v17
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v17, off, s[8:11], 0 offset:208 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v18, off, s[8:11], 0 offset:212 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v17, 8, v17
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:176
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:204 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:192 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:184 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:188 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v14, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v14, off, s[8:11], 0 offset:164 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:196 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:180 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:176 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v13, off, s[8:11], 0 offset:160 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:172 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v13, 8, v13
-; GFX906-NEXT:    v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:168 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:192
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:152 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:148 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:140 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:128 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:144 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:136 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:132 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:124 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:116 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:120 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:112 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v9, 8, v9
-; GFX906-NEXT:    v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:208
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:104 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:100 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:92 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:80 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:96 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:88 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:84 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:76 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:68 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:72 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:64 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
-; GFX906-NEXT:    v_or_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:224
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:60 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:52 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:56 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:44 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(0)
+; GFX906-NEXT:    global_store_dwordx4 v4, v[33:36], s[0:1] offset:112
 ; GFX906-NEXT:    s_waitcnt vmcnt(7)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:48 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:40 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:32 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:36 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:20 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:24 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
-; GFX906-NEXT:    v_or_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:240
+; GFX906-NEXT:    global_store_dwordx4 v4, v[37:40], s[0:1] offset:96
+; GFX906-NEXT:    s_waitcnt vmcnt(7)
+; GFX906-NEXT:    global_store_dwordx4 v4, v[41:44], s[0:1] offset:80
+; GFX906-NEXT:    s_waitcnt vmcnt(7)
+; GFX906-NEXT:    global_store_dwordx4 v4, v[45:48], s[0:1] offset:64
+; GFX906-NEXT:    s_waitcnt vmcnt(7)
+; GFX906-NEXT:    global_store_dwordx4 v4, v[49:52], s[0:1] offset:48
+; GFX906-NEXT:    s_waitcnt vmcnt(7)
+; GFX906-NEXT:    global_store_dwordx4 v4, v[53:56], s[0:1] offset:32
+; GFX906-NEXT:    s_waitcnt vmcnt(7)
+; GFX906-NEXT:    global_store_dwordx4 v4, v[57:60], s[0:1] offset:16
+; GFX906-NEXT:    s_waitcnt vmcnt(7)
+; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:240
+; GFX906-NEXT:    global_store_dwordx4 v4, v[5:8], s[0:1] offset:224
+; GFX906-NEXT:    global_store_dwordx4 v4, v[9:12], s[0:1] offset:208
+; GFX906-NEXT:    global_store_dwordx4 v4, v[13:16], s[0:1] offset:192
+; GFX906-NEXT:    global_store_dwordx4 v4, v[17:20], s[0:1] offset:176
+; GFX906-NEXT:    global_store_dwordx4 v4, v[21:24], s[0:1] offset:160
+; GFX906-NEXT:    global_store_dwordx4 v4, v[25:28], s[0:1] offset:144
+; GFX906-NEXT:    global_store_dwordx4 v4, v[29:32], s[0:1] offset:128
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()

>From 8d082d239eb8d064fedefad16862c34b51282d89 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 21 Feb 2024 08:48:41 -0800
Subject: [PATCH 02/11] Handle loop edge in PHI nodes + Port to
 LateCodegenPrepare + Move LateCodeGenPrepare after CodeSinking + Integrate
 the loops

Change-Id: Iac0baf0ab9e523bf303585b545f060293e6fb4f0
---
 .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp    | 338 -----------------
 .../AMDGPU/AMDGPULateCodeGenPrepare.cpp       | 356 +++++++++++++++++-
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |   4 +-
 .../amdgpu-codegenprepare-break-large-phis.ll | 133 +++----
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll      |  24 +-
 llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll     |  24 +-
 llvm/test/CodeGen/AMDGPU/vni8-loop-carried.ll |  46 +++
 7 files changed, 472 insertions(+), 453 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/vni8-loop-carried.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 8f1dd1c522b04c..a37302d3c41267 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -344,85 +344,6 @@ class AMDGPUCodeGenPrepare : public FunctionPass {
   StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
 };
 
-class LiveRegConversion {
-private:
-  // The instruction which defined the original virtual register used across
-  // blocks
-  Instruction *LiveRegDef;
-  // The original type
-  Type *OriginalType;
-  // The desired type
-  Type *NewType;
-  // The instruction sequence that converts the virtual register, to be used
-  // instead of the original
-  std::optional<Instruction *> Converted;
-  // The builder used to build the conversion instruction
-  IRBuilder<> ConvertBuilder;
-
-public:
-  // The instruction which defined the original virtual register used across
-  // blocks
-  Instruction *getLiveRegDef() { return LiveRegDef; }
-  // The original type
-  Type *getOriginalType() { return OriginalType; }
-  // The desired type
-  Type *getNewType() { return NewType; }
-  void setNewType(Type *NewType) { this->NewType = NewType; }
-  // The instruction that conerts the virtual register, to be used instead of
-  // the original
-  std::optional<Instruction *> &getConverted() { return Converted; }
-  void setConverted(Instruction *Converted) { this->Converted = Converted; }
-  // The builder used to build the conversion instruction
-  IRBuilder<> &getConverBuilder() { return ConvertBuilder; }
-  // Do we have a instruction sequence which convert the original virtual
-  // register
-  bool hasConverted() { return Converted.has_value(); }
-
-  LiveRegConversion(Instruction *LiveRegDef, BasicBlock *InsertBlock,
-                    BasicBlock::iterator InsertPt)
-      : LiveRegDef(LiveRegDef), OriginalType(LiveRegDef->getType()),
-        ConvertBuilder(InsertBlock, InsertPt) {}
-  LiveRegConversion(Instruction *LiveRegDef, Type *NewType,
-                    BasicBlock *InsertBlock, BasicBlock::iterator InsertPt)
-      : LiveRegDef(LiveRegDef), OriginalType(LiveRegDef->getType()),
-        NewType(NewType), ConvertBuilder(InsertBlock, InsertPt) {}
-};
-
-class LiveRegOptimizer {
-private:
-  Module *Mod = nullptr;
-  // The scalar type to convert to
-  Type *ConvertToScalar;
-  // Holds the collection of PHIs with their pending new operands
-  SmallVector<std::pair<Instruction *,
-                        SmallVector<std::pair<Instruction *, BasicBlock *>, 4>>,
-              4>
-      PHIUpdater;
-
-public:
-  // Should the def of the instruction be converted if it is live across blocks
-  bool shouldReplaceUses(const Instruction &I);
-  // Convert the virtual register to the compatible vector of legal type
-  void convertToOptType(LiveRegConversion &LR);
-  // Convert the virtual register back to the original type, stripping away
-  // the MSBs in cases where there was an imperfect fit (e.g. v2i32 -> v7i8)
-  void convertFromOptType(LiveRegConversion &LR);
-  // Get a vector of desired scalar type that is compatible with the original
-  // vector. In cases where there is no bitsize equivalent using a legal vector
-  // type, we pad the MSBs (e.g. v7i8 -> v2i32)
-  Type *getCompatibleType(Instruction *InstToConvert);
-  // Find and replace uses of the virtual register in different block with a
-  // newly produced virtual register of legal type
-  bool replaceUses(Instruction &I);
-  // Replace the collected PHIs with newly produced incoming values. Replacement
-  // is only done if we have a replacement for each original incoming value.
-  bool replacePHIs();
-
-  LiveRegOptimizer(Module *Mod) : Mod(Mod) {
-    ConvertToScalar = Type::getInt32Ty(Mod->getContext());
-  }
-};
-
 } // end anonymous namespace
 
 bool AMDGPUCodeGenPrepareImpl::run(Function &F) {
@@ -453,268 +374,9 @@ bool AMDGPUCodeGenPrepareImpl::run(Function &F) {
     }
   }
 
-  // GlobalISel should directly use the values, and do not need to emit
-  // CopyTo/CopyFrom Regs across blocks
-  if (UsesGlobalISel)
-    return MadeChange;
-
-  // "Optimize" the virtual regs that cross basic block boundaries. In such
-  // cases, vectors of illegal types will be scalarized and widened, with each
-  // scalar living in its own physical register. The optimization converts the
-  // vectors to equivalent vectors of legal type (which are convereted back
-  // before uses in subsequenmt blocks), to pack the bits into fewer physical
-  // registers (used in CopyToReg/CopyFromReg pairs).
-  LiveRegOptimizer LRO(Mod);
-  for (auto &BB : F) {
-    for (auto &I : BB) {
-      if (!LRO.shouldReplaceUses(I))
-        continue;
-      MadeChange |= LRO.replaceUses(I);
-    }
-  }
-
-  MadeChange |= LRO.replacePHIs();
   return MadeChange;
 }
 
-bool LiveRegOptimizer::replaceUses(Instruction &I) {
-  bool MadeChange = false;
-
-  struct ConvertUseInfo {
-    Instruction *Converted;
-    SmallVector<Instruction *, 4> Users;
-  };
-  DenseMap<BasicBlock *, ConvertUseInfo> UseConvertTracker;
-
-  LiveRegConversion FromLRC(
-      &I, I.getParent(),
-      static_cast<BasicBlock::iterator>(std::next(I.getIterator())));
-  FromLRC.setNewType(getCompatibleType(FromLRC.getLiveRegDef()));
-  for (auto IUser = I.user_begin(); IUser != I.user_end(); IUser++) {
-
-    if (auto UserInst = dyn_cast<Instruction>(*IUser)) {
-      if (UserInst->getParent() != I.getParent()) {
-        LLVM_DEBUG(dbgs() << *UserInst << "\n\tUses "
-                          << *FromLRC.getOriginalType()
-                          << " from previous block. Needs conversion\n");
-        convertToOptType(FromLRC);
-        if (!FromLRC.hasConverted())
-          continue;
-        // If it is a PHI node, just create and collect the new operand. We can
-        // only replace the PHI node once we have converted all the operands
-        if (auto PhiInst = dyn_cast<PHINode>(UserInst)) {
-          for (unsigned Idx = 0; Idx < PhiInst->getNumIncomingValues(); Idx++) {
-            auto IncVal = PhiInst->getIncomingValue(Idx);
-            if (&I == dyn_cast<Instruction>(IncVal)) {
-              auto IncBlock = PhiInst->getIncomingBlock(Idx);
-              auto PHIOps = find_if(
-                  PHIUpdater,
-                  [&UserInst](
-                      std::pair<Instruction *,
-                                SmallVector<
-                                    std::pair<Instruction *, BasicBlock *>, 4>>
-                          &Entry) { return Entry.first == UserInst; });
-
-              if (PHIOps == PHIUpdater.end())
-                PHIUpdater.push_back(
-                    {UserInst, {{*FromLRC.getConverted(), IncBlock}}});
-              else
-                PHIOps->second.push_back({*FromLRC.getConverted(), IncBlock});
-
-              break;
-            }
-          }
-          continue;
-        }
-
-        // Do not create multiple conversion sequences if there are multiple
-        // uses in the same block
-        if (UseConvertTracker.contains(UserInst->getParent())) {
-          UseConvertTracker[UserInst->getParent()].Users.push_back(UserInst);
-          LLVM_DEBUG(dbgs() << "\tUser already has access to converted def\n");
-          continue;
-        }
-
-        LiveRegConversion ToLRC(*FromLRC.getConverted(), I.getType(),
-                                UserInst->getParent(),
-                                static_cast<BasicBlock::iterator>(
-                                    UserInst->getParent()->getFirstNonPHIIt()));
-        convertFromOptType(ToLRC);
-        assert(ToLRC.hasConverted());
-        UseConvertTracker[UserInst->getParent()] = {*ToLRC.getConverted(),
-                                                    {UserInst}};
-      }
-    }
-  }
-
-  // Replace uses of with in a separate loop that is not dependent upon the
-  // state of the uses
-  for (auto &Entry : UseConvertTracker) {
-    for (auto &UserInst : Entry.second.Users) {
-      LLVM_DEBUG(dbgs() << *UserInst
-                        << "\n\tNow uses: " << *Entry.second.Converted << "\n");
-      UserInst->replaceUsesOfWith(&I, Entry.second.Converted);
-      MadeChange = true;
-    }
-  }
-  return MadeChange;
-}
-
-bool LiveRegOptimizer::replacePHIs() {
-  bool MadeChange = false;
-  for (auto Ele : PHIUpdater) {
-    auto ThePHINode = dyn_cast<PHINode>(Ele.first);
-    assert(ThePHINode);
-    auto NewPHINodeOps = Ele.second;
-    LLVM_DEBUG(dbgs() << "Attempting to replace: " << *ThePHINode << "\n");
-    // If we have conveted all the required operands, then do the replacement
-    if (ThePHINode->getNumIncomingValues() == NewPHINodeOps.size()) {
-      IRBuilder<> Builder(Ele.first);
-      auto NPHI = Builder.CreatePHI(NewPHINodeOps[0].first->getType(),
-                                    NewPHINodeOps.size());
-      for (auto IncVals : NewPHINodeOps) {
-        NPHI->addIncoming(IncVals.first, IncVals.second);
-        LLVM_DEBUG(dbgs() << "  Using: " << *IncVals.first
-                          << "  For: " << IncVals.second->getName() << "\n");
-      }
-      LLVM_DEBUG(dbgs() << "Sucessfully replaced with " << *NPHI << "\n");
-      LiveRegConversion ToLRC(NPHI, ThePHINode->getType(),
-                              ThePHINode->getParent(),
-                              static_cast<BasicBlock::iterator>(
-                                  ThePHINode->getParent()->getFirstNonPHIIt()));
-      convertFromOptType(ToLRC);
-      assert(ToLRC.hasConverted());
-      Ele.first->replaceAllUsesWith(*ToLRC.getConverted());
-      // The old PHI is no longer used
-      ThePHINode->eraseFromParent();
-      MadeChange = true;
-    }
-  }
-  return MadeChange;
-}
-
-Type *LiveRegOptimizer::getCompatibleType(Instruction *InstToConvert) {
-  auto OriginalType = InstToConvert->getType();
-  assert(OriginalType->getScalarSizeInBits() <=
-         ConvertToScalar->getScalarSizeInBits());
-  auto VTy = dyn_cast<VectorType>(OriginalType);
-  if (!VTy)
-    return ConvertToScalar;
-
-  auto OriginalSize =
-      VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue();
-  auto ConvertScalarSize = ConvertToScalar->getScalarSizeInBits();
-  auto ConvertEltCount =
-      (OriginalSize + ConvertScalarSize - 1) / ConvertScalarSize;
-
-  return VectorType::get(Type::getIntNTy(Mod->getContext(), ConvertScalarSize),
-                         llvm::ElementCount::getFixed(ConvertEltCount));
-}
-
-void LiveRegOptimizer::convertToOptType(LiveRegConversion &LR) {
-  if (LR.hasConverted()) {
-    LLVM_DEBUG(dbgs() << "\tAlready has converted def\n");
-    return;
-  }
-
-  auto VTy = dyn_cast<VectorType>(LR.getOriginalType());
-  assert(VTy);
-  auto NewVTy = dyn_cast<VectorType>(LR.getNewType());
-  assert(NewVTy);
-
-  auto V = static_cast<Value *>(LR.getLiveRegDef());
-  auto OriginalSize =
-      VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue();
-  auto NewSize =
-      NewVTy->getScalarSizeInBits() * NewVTy->getElementCount().getFixedValue();
-
-  auto &Builder = LR.getConverBuilder();
-
-  // If there is a bitsize match, we can fit the old vector into a new vector of
-  // desired type
-  if (OriginalSize == NewSize) {
-    LR.setConverted(dyn_cast<Instruction>(Builder.CreateBitCast(V, NewVTy)));
-    LLVM_DEBUG(dbgs() << "\tConverted def to "
-                      << *(*LR.getConverted())->getType() << "\n");
-    return;
-  }
-
-  // If there is a bitsize mismatch, we must use a wider vector
-  assert(NewSize > OriginalSize);
-  auto ExpandedVecElementCount =
-      llvm::ElementCount::getFixed(NewSize / VTy->getScalarSizeInBits());
-
-  SmallVector<int, 8> ShuffleMask;
-  for (unsigned I = 0; I < VTy->getElementCount().getFixedValue(); I++)
-    ShuffleMask.push_back(I);
-
-  for (uint64_t I = VTy->getElementCount().getFixedValue();
-       I < ExpandedVecElementCount.getFixedValue(); I++)
-    ShuffleMask.push_back(VTy->getElementCount().getFixedValue());
-
-  auto ExpandedVec =
-      dyn_cast<Instruction>(Builder.CreateShuffleVector(V, ShuffleMask));
-  LR.setConverted(
-      dyn_cast<Instruction>(Builder.CreateBitCast(ExpandedVec, NewVTy)));
-  LLVM_DEBUG(dbgs() << "\tConverted def to " << *(*LR.getConverted())->getType()
-                    << "\n");
-  return;
-}
-
-void LiveRegOptimizer::convertFromOptType(LiveRegConversion &LRC) {
-  auto VTy = dyn_cast<VectorType>(LRC.getOriginalType());
-  assert(VTy);
-  auto NewVTy = dyn_cast<VectorType>(LRC.getNewType());
-  assert(NewVTy);
-
-  auto V = static_cast<Value *>(LRC.getLiveRegDef());
-  auto OriginalSize =
-      VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue();
-  auto NewSize =
-      NewVTy->getScalarSizeInBits() * NewVTy->getElementCount().getFixedValue();
-
-  auto &Builder = LRC.getConverBuilder();
-
-  // If there is a bitsize match, we simply convert back to the original type
-  if (OriginalSize == NewSize) {
-    LRC.setConverted(dyn_cast<Instruction>(Builder.CreateBitCast(V, NewVTy)));
-    LLVM_DEBUG(dbgs() << "\tProduced for user: " << **LRC.getConverted()
-                      << "\n");
-    return;
-  }
-
-  // If there is a bitsize mismatch, we have used a wider vector and must strip
-  // the MSBs to convert back to the original type
-  assert(OriginalSize > NewSize);
-  auto ExpandedVecElementCount = llvm::ElementCount::getFixed(
-      OriginalSize / NewVTy->getScalarSizeInBits());
-  auto ExpandedVT = VectorType::get(
-      Type::getIntNTy(Mod->getContext(), NewVTy->getScalarSizeInBits()),
-      ExpandedVecElementCount);
-  auto Converted = dyn_cast<Instruction>(
-      Builder.CreateBitCast(LRC.getLiveRegDef(), ExpandedVT));
-
-  auto NarrowElementCount = NewVTy->getElementCount().getFixedValue();
-  SmallVector<int, 8> ShuffleMask;
-  for (uint64_t I = 0; I < NarrowElementCount; I++)
-    ShuffleMask.push_back(I);
-
-  auto NarrowVec = dyn_cast<Instruction>(
-      Builder.CreateShuffleVector(Converted, ShuffleMask));
-  LRC.setConverted(dyn_cast<Instruction>(NarrowVec));
-  LLVM_DEBUG(dbgs() << "\tProduced for user: " << **LRC.getConverted() << "\n");
-  return;
-}
-
-bool LiveRegOptimizer::shouldReplaceUses(const Instruction &I) {
-  // Vectors of illegal types are copied across blocks in an efficient manner.
-  // They are scalarized and widened to legal scalars. In such cases, we can do
-  // better by using legal vector types
-  auto IType = I.getType();
-  return IType->isVectorTy() && IType->getScalarSizeInBits() < 16 &&
-         !I.getType()->getScalarType()->isPointerTy();
-}
-
 unsigned AMDGPUCodeGenPrepareImpl::getBaseElementBitWidth(const Type *T) const {
   assert(needsPromotionToI32(T) && "T does not need promotion to i32");
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 69fdeaebe0a018..f19f145ae86064 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -81,6 +81,85 @@ class AMDGPULateCodeGenPrepare
   bool visitLoadInst(LoadInst &LI);
 };
 
+class LiveRegConversion {
+private:
+  // The instruction which defined the original virtual register used across
+  // blocks
+  Instruction *LiveRegDef;
+  // The original type
+  Type *OriginalType;
+  // The desired type
+  Type *NewType;
+  // The instruction sequence that converts the virtual register, to be used
+  // instead of the original
+  std::optional<Instruction *> Converted;
+  // The builder used to build the conversion instruction
+  IRBuilder<> ConvertBuilder;
+
+public:
+  // The instruction which defined the original virtual register used across
+  // blocks
+  Instruction *getLiveRegDef() { return LiveRegDef; }
+  // The original type
+  Type *getOriginalType() { return OriginalType; }
+  // The desired type
+  Type *getNewType() { return NewType; }
+  void setNewType(Type *NewType) { this->NewType = NewType; }
+  // The instruction that converts the virtual register, to be used instead of
+  // the original
+  std::optional<Instruction *> &getConverted() { return Converted; }
+  void setConverted(Instruction *Converted) { this->Converted = Converted; }
+  // The builder used to build the conversion instruction
+  IRBuilder<> &getConvertBuilder() { return ConvertBuilder; }
+  // Do we have an instruction sequence which converts the original virtual
+  // register
+  bool hasConverted() { return Converted.has_value(); }
+
+  LiveRegConversion(Instruction *LiveRegDef, BasicBlock *InsertBlock,
+                    BasicBlock::iterator InsertPt)
+      : LiveRegDef(LiveRegDef), OriginalType(LiveRegDef->getType()),
+        ConvertBuilder(InsertBlock, InsertPt) {}
+  LiveRegConversion(Instruction *LiveRegDef, Type *NewType,
+                    BasicBlock *InsertBlock, BasicBlock::iterator InsertPt)
+      : LiveRegDef(LiveRegDef), OriginalType(LiveRegDef->getType()),
+        NewType(NewType), ConvertBuilder(InsertBlock, InsertPt) {}
+};
+
+class LiveRegOptimizer {
+private:
+  Module *Mod = nullptr;
+  // The scalar type to convert to
+  Type *ConvertToScalar;
+  // Holds the collection of PHIs with their pending new operands
+  SmallVector<std::pair<Instruction *,
+                        SmallVector<std::pair<Instruction *, BasicBlock *>, 4>>,
+              4>
+      PHIUpdater;
+
+public:
+  // Should the def of the instruction be converted if it is live across blocks
+  bool shouldReplaceUses(const Instruction &I);
+  // Convert the virtual register to the compatible vector of legal type
+  void convertToOptType(LiveRegConversion &LR);
+  // Convert the virtual register back to the original type, stripping away
+  // the MSBs in cases where there was an imperfect fit (e.g. v2i32 -> v7i8)
+  void convertFromOptType(LiveRegConversion &LR);
+  // Get a vector of desired scalar type that is compatible with the original
+  // vector. In cases where there is no bitsize equivalent using a legal vector
+  // type, we pad the MSBs (e.g. v7i8 -> v2i32)
+  Type *getCompatibleType(Instruction *InstToConvert);
+  // Find and replace uses of the virtual register in a different block with a
+  // newly produced virtual register of legal type
+  bool replaceUses(Instruction &I);
+  // Replace the collected PHIs with newly produced incoming values. Replacement
+  // is only done if we have a replacement for each original incoming value.
+  bool replacePHIs();
+
+  LiveRegOptimizer(Module *Mod) : Mod(Mod) {
+    ConvertToScalar = Type::getInt32Ty(Mod->getContext());
+  }
+};
+
 } // end anonymous namespace
 
 bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
@@ -102,14 +181,287 @@ bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
   AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
   UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
 
+  // "Optimize" the virtual regs that cross basic block boundaries. In such
+  // cases, vectors of illegal types will be scalarized and widened, with each
+  // scalar living in its own physical register. The optimization converts the
+  // vectors to equivalent vectors of legal type (which are converted back
+  // before uses in subsequent blocks), to pack the bits into fewer physical
+  // registers (used in CopyToReg/CopyFromReg pairs).
+  LiveRegOptimizer LRO(Mod);
+
   bool Changed = false;
   for (auto &BB : F)
-    for (Instruction &I : llvm::make_early_inc_range(BB))
+    for (Instruction &I : llvm::make_early_inc_range(BB)) {
       Changed |= visit(I);
-
+      // GlobalISel should directly use the values, and does not need to emit
+      // CopyTo/CopyFrom Regs across blocks
+      if (TM.Options.EnableGlobalISel)
+        continue;
+      if (!LRO.shouldReplaceUses(I))
+        continue;
+      Changed |= LRO.replaceUses(I);
+    }
+
+  Changed |= LRO.replacePHIs();
   return Changed;
 }
 
+bool LiveRegOptimizer::replaceUses(Instruction &I) {
+  bool MadeChange = false;
+
+  struct ConvertUseInfo {
+    Instruction *Converted;
+    SmallVector<Instruction *, 4> Users;
+  };
+  DenseMap<BasicBlock *, ConvertUseInfo> UseConvertTracker;
+
+  LiveRegConversion FromLRC(
+      &I, I.getParent(),
+      static_cast<BasicBlock::iterator>(std::next(I.getIterator())));
+  FromLRC.setNewType(getCompatibleType(FromLRC.getLiveRegDef()));
+  for (auto IUser = I.user_begin(); IUser != I.user_end(); IUser++) {
+
+    if (auto UserInst = dyn_cast<Instruction>(*IUser)) {
+      if (UserInst->getParent() != I.getParent() || isa<PHINode>(UserInst)) {
+        LLVM_DEBUG(dbgs() << *UserInst << "\n\tUses "
+                          << *FromLRC.getOriginalType()
+                          << " from previous block. Needs conversion\n");
+        convertToOptType(FromLRC);
+        if (!FromLRC.hasConverted())
+          continue;
+        // If it is a PHI node, just create and collect the new operand. We can
+        // only replace the PHI node once we have converted all the operands
+        if (auto PhiInst = dyn_cast<PHINode>(UserInst)) {
+          for (unsigned Idx = 0; Idx < PhiInst->getNumIncomingValues(); Idx++) {
+            auto IncVal = PhiInst->getIncomingValue(Idx);
+            if (&I == dyn_cast<Instruction>(IncVal)) {
+              auto IncBlock = PhiInst->getIncomingBlock(Idx);
+              auto PHIOps = find_if(
+                  PHIUpdater,
+                  [&UserInst](
+                      std::pair<Instruction *,
+                                SmallVector<
+                                    std::pair<Instruction *, BasicBlock *>, 4>>
+                          &Entry) { return Entry.first == UserInst; });
+
+              if (PHIOps == PHIUpdater.end())
+                PHIUpdater.push_back(
+                    {UserInst, {{*FromLRC.getConverted(), IncBlock}}});
+              else
+                PHIOps->second.push_back({*FromLRC.getConverted(), IncBlock});
+
+              break;
+            }
+          }
+          continue;
+        }
+
+        // Do not create multiple conversion sequences if there are multiple
+        // uses in the same block
+        if (UseConvertTracker.contains(UserInst->getParent())) {
+          UseConvertTracker[UserInst->getParent()].Users.push_back(UserInst);
+          LLVM_DEBUG(dbgs() << "\tUser already has access to converted def\n");
+          continue;
+        }
+
+        LiveRegConversion ToLRC(*FromLRC.getConverted(), I.getType(),
+                                UserInst->getParent(),
+                                static_cast<BasicBlock::iterator>(
+                                    UserInst->getParent()->getFirstNonPHIIt()));
+        convertFromOptType(ToLRC);
+        assert(ToLRC.hasConverted());
+        UseConvertTracker[UserInst->getParent()] = {*ToLRC.getConverted(),
+                                                    {UserInst}};
+      }
+    }
+  }
+
+  // Replace uses of the def with the converted value in a separate loop that
+  // is not dependent upon the state of the uses
+  for (auto &Entry : UseConvertTracker) {
+    for (auto &UserInst : Entry.second.Users) {
+      LLVM_DEBUG(dbgs() << *UserInst
+                        << "\n\tNow uses: " << *Entry.second.Converted << "\n");
+      UserInst->replaceUsesOfWith(&I, Entry.second.Converted);
+      MadeChange = true;
+    }
+  }
+  return MadeChange;
+}
+
+bool LiveRegOptimizer::replacePHIs() {
+  bool MadeChange = false;
+  for (auto Ele : PHIUpdater) {
+    auto ThePHINode = cast<PHINode>(Ele.first);
+    auto NewPHINodeOps = Ele.second;
+    LLVM_DEBUG(dbgs() << "Attempting to replace: " << *ThePHINode << "\n");
+    // If we have converted all the required operands, then do the replacement
+    if (ThePHINode->getNumIncomingValues() == NewPHINodeOps.size()) {
+      IRBuilder<> Builder(Ele.first);
+      auto NPHI = Builder.CreatePHI(NewPHINodeOps[0].first->getType(),
+                                    NewPHINodeOps.size());
+      for (auto IncVals : NewPHINodeOps) {
+        NPHI->addIncoming(IncVals.first, IncVals.second);
+        LLVM_DEBUG(dbgs() << "  Using: " << *IncVals.first
+                          << "  For: " << IncVals.second->getName() << "\n");
+      }
+      LLVM_DEBUG(dbgs() << "Sucessfully replaced with " << *NPHI << "\n");
+      LiveRegConversion ToLRC(NPHI, ThePHINode->getType(),
+                              ThePHINode->getParent(),
+                              static_cast<BasicBlock::iterator>(
+                                  ThePHINode->getParent()->getFirstNonPHIIt()));
+      convertFromOptType(ToLRC);
+      assert(ToLRC.hasConverted());
+      Ele.first->replaceAllUsesWith(*ToLRC.getConverted());
+      // The old PHI is no longer used
+      ThePHINode->eraseFromParent();
+      MadeChange = true;
+    }
+  }
+  return MadeChange;
+}
+
+Type *LiveRegOptimizer::getCompatibleType(Instruction *InstToConvert) {
+  auto OriginalType = InstToConvert->getType();
+  assert(OriginalType->getScalarSizeInBits() <=
+         ConvertToScalar->getScalarSizeInBits());
+  auto VTy = dyn_cast<VectorType>(OriginalType);
+  if (!VTy)
+    return ConvertToScalar;
+
+  auto OriginalSize =
+      VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue();
+  auto ConvertScalarSize = ConvertToScalar->getScalarSizeInBits();
+  auto ConvertEltCount =
+      (OriginalSize + ConvertScalarSize - 1) / ConvertScalarSize;
+
+  if (OriginalSize <= ConvertScalarSize)
+    return IntegerType::get(Mod->getContext(), ConvertScalarSize);
+
+  return VectorType::get(Type::getIntNTy(Mod->getContext(), ConvertScalarSize),
+                         llvm::ElementCount::getFixed(ConvertEltCount));
+}
+
+void LiveRegOptimizer::convertToOptType(LiveRegConversion &LR) {
+  if (LR.hasConverted()) {
+    LLVM_DEBUG(dbgs() << "\tAlready has converted def\n");
+    return;
+  }
+
+  auto VTy = cast<VectorType>(LR.getOriginalType());
+
+  auto NewTy = LR.getNewType();
+  assert(NewTy);
+  auto NewVTy = NewTy->isVectorTy() ? cast<VectorType>(NewTy) : nullptr;
+
+  auto V = static_cast<Value *>(LR.getLiveRegDef());
+  auto OriginalSize =
+      VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue();
+  auto NewSize = NewTy->isVectorTy()
+                     ? NewVTy->getScalarSizeInBits() *
+                           NewVTy->getElementCount().getFixedValue()
+                     : NewTy->getScalarSizeInBits();
+
+  auto &Builder = LR.getConvertBuilder();
+
+  // If there is a bitsize match, we can fit the old vector into a new vector of
+  // desired type
+  if (OriginalSize == NewSize) {
+    LR.setConverted(dyn_cast<Instruction>(Builder.CreateBitCast(V, NewTy)));
+    LLVM_DEBUG(dbgs() << "\tConverted def to "
+                      << *(*LR.getConverted())->getType() << "\n");
+    return;
+  }
+
+  // If there is a bitsize mismatch, we must use a wider vector
+  assert(NewSize > OriginalSize);
+  auto ExpandedVecElementCount =
+      llvm::ElementCount::getFixed(NewSize / VTy->getScalarSizeInBits());
+
+  SmallVector<int, 8> ShuffleMask;
+  for (unsigned I = 0; I < VTy->getElementCount().getFixedValue(); I++)
+    ShuffleMask.push_back(I);
+
+  for (uint64_t I = VTy->getElementCount().getFixedValue();
+       I < ExpandedVecElementCount.getFixedValue(); I++)
+    ShuffleMask.push_back(VTy->getElementCount().getFixedValue());
+
+  auto ExpandedVec =
+      dyn_cast<Instruction>(Builder.CreateShuffleVector(V, ShuffleMask));
+  LR.setConverted(
+      dyn_cast<Instruction>(Builder.CreateBitCast(ExpandedVec, NewTy)));
+  LLVM_DEBUG(dbgs() << "\tConverted def to " << *(*LR.getConverted())->getType()
+                    << "\n");
+  return;
+}
+
+void LiveRegOptimizer::convertFromOptType(LiveRegConversion &LRC) {
+  auto OTy = LRC.getOriginalType();
+  auto VTy =
+      OTy->isVectorTy() ? dyn_cast<VectorType>(LRC.getOriginalType()) : nullptr;
+
+  auto NewVTy = cast<VectorType>(LRC.getNewType());
+
+  auto V = static_cast<Value *>(LRC.getLiveRegDef());
+  auto OriginalSize =
+      OTy->isVectorTy()
+          ? VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue()
+          : OTy->getScalarSizeInBits();
+  auto NewSize =
+      NewVTy->getScalarSizeInBits() * NewVTy->getElementCount().getFixedValue();
+
+  auto &Builder = LRC.getConvertBuilder();
+
+  // If there is a bitsize match, we simply convert back to the original type
+  if (OriginalSize == NewSize) {
+    LRC.setConverted(dyn_cast<Instruction>(Builder.CreateBitCast(V, NewVTy)));
+    LLVM_DEBUG(dbgs() << "\tProduced for user: " << **LRC.getConverted()
+                      << "\n");
+    return;
+  }
+
+  if (!OTy->isVectorTy()) {
+    auto Trunc = dyn_cast<Instruction>(Builder.CreateTrunc(
+        LRC.getLiveRegDef(), IntegerType::get(Mod->getContext(), NewSize)));
+    auto Original = dyn_cast<Instruction>(Builder.CreateBitCast(Trunc, NewVTy));
+    LRC.setConverted(dyn_cast<Instruction>(Original));
+    LLVM_DEBUG(dbgs() << "\tProduced for user: " << **LRC.getConverted()
+                      << "\n");
+    return;
+  }
+
+  // If there is a bitsize mismatch, we have used a wider vector and must strip
+  // the MSBs to convert back to the original type
+  assert(OriginalSize > NewSize);
+  auto ExpandedVecElementCount = llvm::ElementCount::getFixed(
+      OriginalSize / NewVTy->getScalarSizeInBits());
+  auto ExpandedVT = VectorType::get(
+      Type::getIntNTy(Mod->getContext(), NewVTy->getScalarSizeInBits()),
+      ExpandedVecElementCount);
+  auto Converted = dyn_cast<Instruction>(
+      Builder.CreateBitCast(LRC.getLiveRegDef(), ExpandedVT));
+
+  auto NarrowElementCount = NewVTy->getElementCount().getFixedValue();
+  SmallVector<int, 8> ShuffleMask;
+  for (uint64_t I = 0; I < NarrowElementCount; I++)
+    ShuffleMask.push_back(I);
+
+  auto NarrowVec = dyn_cast<Instruction>(
+      Builder.CreateShuffleVector(Converted, ShuffleMask));
+  LRC.setConverted(dyn_cast<Instruction>(NarrowVec));
+  LLVM_DEBUG(dbgs() << "\tProduced for user: " << **LRC.getConverted() << "\n");
+  return;
+}
+
+bool LiveRegOptimizer::shouldReplaceUses(const Instruction &I) {
+  // Vectors of illegal types are copied across blocks in an inefficient manner.
+  // They are scalarized and widened to legal scalars. In such cases, we can do
+  // better by using legal vector types
+  auto IType = I.getType();
+  return IType->isVectorTy() && IType->getScalarSizeInBits() < 16 &&
+         !I.getType()->getScalarType()->isPointerTy();
+}
+
 bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
   unsigned AS = LI.getPointerAddressSpace();
   // Skip non-constant address space.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 305a6c8c3b9262..c15481336075e6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1150,10 +1150,10 @@ bool GCNPassConfig::addPreISel() {
   AMDGPUPassConfig::addPreISel();
 
   if (TM->getOptLevel() > CodeGenOptLevel::None)
-    addPass(createAMDGPULateCodeGenPreparePass());
+    addPass(createSinkingPass());
 
   if (TM->getOptLevel() > CodeGenOptLevel::None)
-    addPass(createSinkingPass());
+    addPass(createAMDGPULateCodeGenPreparePass());
 
   // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
   // regions formed by them.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll
index f4871fa1314426..11772d252a16fc 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll
@@ -495,15 +495,10 @@ define amdgpu_kernel void @phi_v23i8(<23 x i8> %in, ptr %out, i1 %cond) {
 ; OPT:       then:
 ; OPT-NEXT:    [[X:%.*]] = insertelement <23 x i8> [[IN:%.*]], i8 42, i32 3
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE0:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; OPT-NEXT:    [[TMP0:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE0]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE2:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; OPT-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE2]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE4:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; OPT-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE4]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE6:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; OPT-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE6]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE8:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; OPT-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE8]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE10:%.*]] = extractelement <23 x i8> [[X]], i64 20
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE12:%.*]] = extractelement <23 x i8> [[X]], i64 21
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE14:%.*]] = extractelement <23 x i8> [[X]], i64 22
@@ -511,41 +506,31 @@ define amdgpu_kernel void @phi_v23i8(<23 x i8> %in, ptr %out, i1 %cond) {
 ; OPT:       else:
 ; OPT-NEXT:    [[Y:%.*]] = insertelement <23 x i8> [[IN]], i8 64, i32 6
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE1:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; OPT-NEXT:    [[TMP5:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE1]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE3:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; OPT-NEXT:    [[TMP6:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE3]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE5:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; OPT-NEXT:    [[TMP7:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE5]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE7:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; OPT-NEXT:    [[TMP8:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE7]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE9:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; OPT-NEXT:    [[TMP9:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE9]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE11:%.*]] = extractelement <23 x i8> [[Y]], i64 20
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE13:%.*]] = extractelement <23 x i8> [[Y]], i64 21
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE15:%.*]] = extractelement <23 x i8> [[Y]], i64 22
 ; OPT-NEXT:    br label [[FINALLY]]
 ; OPT:       finally:
-; OPT-NEXT:    [[TMP10:%.*]] = phi <1 x i32> [ [[TMP0]], [[THEN]] ], [ [[TMP5]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP11:%.*]] = phi <1 x i32> [ [[TMP1]], [[THEN]] ], [ [[TMP6]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP12:%.*]] = phi <1 x i32> [ [[TMP2]], [[THEN]] ], [ [[TMP7]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP13:%.*]] = phi <1 x i32> [ [[TMP3]], [[THEN]] ], [ [[TMP8]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP14:%.*]] = phi <1 x i32> [ [[TMP4]], [[THEN]] ], [ [[TMP9]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP15:%.*]] = phi i8 [ [[LARGEPHI_EXTRACTSLICE10]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP16:%.*]] = phi i8 [ [[LARGEPHI_EXTRACTSLICE12]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE13]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP17:%.*]] = phi i8 [ [[LARGEPHI_EXTRACTSLICE14]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE15]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP18:%.*]] = bitcast <1 x i32> [[TMP14]] to <4 x i8>
-; OPT-NEXT:    [[TMP19:%.*]] = bitcast <1 x i32> [[TMP13]] to <4 x i8>
-; OPT-NEXT:    [[TMP20:%.*]] = bitcast <1 x i32> [[TMP12]] to <4 x i8>
-; OPT-NEXT:    [[TMP21:%.*]] = bitcast <1 x i32> [[TMP11]] to <4 x i8>
-; OPT-NEXT:    [[TMP22:%.*]] = bitcast <1 x i32> [[TMP10]] to <4 x i8>
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE0:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> poison, <4 x i8> [[TMP22]], i64 0)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE1:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE0]], <4 x i8> [[TMP21]], i64 4)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE2:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE1]], <4 x i8> [[TMP20]], i64 8)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE3:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE2]], <4 x i8> [[TMP19]], i64 12)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE4:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE3]], <4 x i8> [[TMP18]], i64 16)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE5:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE4]], i8 [[TMP15]], i64 20
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE6:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE5]], i8 [[TMP16]], i64 21
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE7:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE6]], i8 [[TMP17]], i64 22
+; OPT-NEXT:    [[TMP0:%.*]] = phi <4 x i8> [ [[LARGEPHI_EXTRACTSLICE0]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP1:%.*]] = phi <4 x i8> [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP2:%.*]] = phi <4 x i8> [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP3:%.*]] = phi <4 x i8> [ [[LARGEPHI_EXTRACTSLICE6]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP4:%.*]] = phi <4 x i8> [ [[LARGEPHI_EXTRACTSLICE8]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP5:%.*]] = phi i8 [ [[LARGEPHI_EXTRACTSLICE10]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP6:%.*]] = phi i8 [ [[LARGEPHI_EXTRACTSLICE12]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE13]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP7:%.*]] = phi i8 [ [[LARGEPHI_EXTRACTSLICE14]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE15]], [[ELSE]] ]
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE0:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> poison, <4 x i8> [[TMP0]], i64 0)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE1:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE0]], <4 x i8> [[TMP1]], i64 4)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE2:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE1]], <4 x i8> [[TMP2]], i64 8)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE3:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE2]], <4 x i8> [[TMP3]], i64 12)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE4:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE3]], <4 x i8> [[TMP4]], i64 16)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE5:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE4]], i8 [[TMP5]], i64 20
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE6:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE5]], i8 [[TMP6]], i64 21
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE7:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE6]], i8 [[TMP7]], i64 22
 ; OPT-NEXT:    store <23 x i8> [[LARGEPHI_INSERTSLICE7]], ptr [[OUT:%.*]], align 1
 ; OPT-NEXT:    ret void
 ;
@@ -554,19 +539,13 @@ define amdgpu_kernel void @phi_v23i8(<23 x i8> %in, ptr %out, i1 %cond) {
 ; NOOPT-NEXT:    br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]]
 ; NOOPT:       then:
 ; NOOPT-NEXT:    [[X:%.*]] = insertelement <23 x i8> [[IN:%.*]], i8 42, i32 3
-; NOOPT-NEXT:    [[TMP0:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; NOOPT-NEXT:    [[TMP1:%.*]] = bitcast <24 x i8> [[TMP0]] to <6 x i32>
 ; NOOPT-NEXT:    br label [[FINALLY:%.*]]
 ; NOOPT:       else:
 ; NOOPT-NEXT:    [[Y:%.*]] = insertelement <23 x i8> [[IN]], i8 64, i32 6
-; NOOPT-NEXT:    [[TMP2:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; NOOPT-NEXT:    [[TMP3:%.*]] = bitcast <24 x i8> [[TMP2]] to <6 x i32>
 ; NOOPT-NEXT:    br label [[FINALLY]]
 ; NOOPT:       finally:
-; NOOPT-NEXT:    [[TMP4:%.*]] = phi <6 x i32> [ [[TMP1]], [[THEN]] ], [ [[TMP3]], [[ELSE]] ]
-; NOOPT-NEXT:    [[TMP5:%.*]] = bitcast <6 x i32> [[TMP4]] to <24 x i8>
-; NOOPT-NEXT:    [[TMP6:%.*]] = shufflevector <24 x i8> [[TMP5]], <24 x i8> poison, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22>
-; NOOPT-NEXT:    store <23 x i8> [[TMP6]], ptr [[OUT:%.*]], align 1
+; NOOPT-NEXT:    [[VAL:%.*]] = phi <23 x i8> [ [[X]], [[THEN]] ], [ [[Y]], [[ELSE]] ]
+; NOOPT-NEXT:    store <23 x i8> [[VAL]], ptr [[OUT:%.*]], align 1
 ; NOOPT-NEXT:    ret void
 ;
 entry:
@@ -593,36 +572,31 @@ define amdgpu_kernel void @phi_v23i8_zeroinit(<23 x i8> %in, ptr %out, i1 %cond)
 ; OPT:       else:
 ; OPT-NEXT:    [[Y:%.*]] = insertelement <23 x i8> [[IN:%.*]], i8 64, i32 6
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE1:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; OPT-NEXT:    [[TMP0:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE1]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE3:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; OPT-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE3]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE5:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; OPT-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE5]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE7:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; OPT-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE7]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE9:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; OPT-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE9]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE11:%.*]] = extractelement <23 x i8> [[Y]], i64 20
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE13:%.*]] = extractelement <23 x i8> [[Y]], i64 21
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE15:%.*]] = extractelement <23 x i8> [[Y]], i64 22
 ; OPT-NEXT:    br label [[FINALLY]]
 ; OPT:       finally:
-; OPT-NEXT:    [[TMP5:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP6:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP7:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP8:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP9:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP10:%.*]] = phi i8 [ 0, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP11:%.*]] = phi i8 [ 0, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE13]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP12:%.*]] = phi i8 [ 0, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE15]], [[ELSE]] ]
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE0:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> poison, <4 x i8> [[TMP5]], i64 0)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE1:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE0]], <4 x i8> [[TMP6]], i64 4)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE2:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE1]], <4 x i8> [[TMP7]], i64 8)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE3:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE2]], <4 x i8> [[TMP8]], i64 12)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE4:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE3]], <4 x i8> [[TMP9]], i64 16)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE5:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE4]], i8 [[TMP10]], i64 20
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE6:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE5]], i8 [[TMP11]], i64 21
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE7:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE6]], i8 [[TMP12]], i64 22
+; OPT-NEXT:    [[TMP0:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP1:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP2:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP3:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP4:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP5:%.*]] = phi i8 [ 0, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP6:%.*]] = phi i8 [ 0, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE13]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP7:%.*]] = phi i8 [ 0, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE15]], [[ELSE]] ]
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE0:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> poison, <4 x i8> [[TMP0]], i64 0)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE1:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE0]], <4 x i8> [[TMP1]], i64 4)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE2:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE1]], <4 x i8> [[TMP2]], i64 8)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE3:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE2]], <4 x i8> [[TMP3]], i64 12)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE4:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE3]], <4 x i8> [[TMP4]], i64 16)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE5:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE4]], i8 [[TMP5]], i64 20
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE6:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE5]], i8 [[TMP6]], i64 21
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE7:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE6]], i8 [[TMP7]], i64 22
 ; OPT-NEXT:    store <23 x i8> [[LARGEPHI_INSERTSLICE7]], ptr [[OUT:%.*]], align 1
 ; OPT-NEXT:    ret void
 ;
@@ -633,8 +607,6 @@ define amdgpu_kernel void @phi_v23i8_zeroinit(<23 x i8> %in, ptr %out, i1 %cond)
 ; NOOPT-NEXT:    br label [[FINALLY:%.*]]
 ; NOOPT:       else:
 ; NOOPT-NEXT:    [[Y:%.*]] = insertelement <23 x i8> [[IN:%.*]], i8 64, i32 6
-; NOOPT-NEXT:    [[TMP0:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; NOOPT-NEXT:    [[TMP1:%.*]] = bitcast <24 x i8> [[TMP0]] to <6 x i32>
 ; NOOPT-NEXT:    br label [[FINALLY]]
 ; NOOPT:       finally:
 ; NOOPT-NEXT:    [[VAL:%.*]] = phi <23 x i8> [ zeroinitializer, [[THEN]] ], [ [[Y]], [[ELSE]] ]
@@ -663,28 +635,25 @@ define amdgpu_kernel void @phi_v15i8_random_constant_init(<15 x i8> %in, ptr %ou
 ; OPT:       else:
 ; OPT-NEXT:    [[Y:%.*]] = insertelement <15 x i8> [[IN:%.*]], i8 64, i32 6
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE1:%.*]] = shufflevector <15 x i8> [[Y]], <15 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; OPT-NEXT:    [[TMP0:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE1]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE3:%.*]] = shufflevector <15 x i8> [[Y]], <15 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; OPT-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE3]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE5:%.*]] = shufflevector <15 x i8> [[Y]], <15 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; OPT-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE5]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE7:%.*]] = extractelement <15 x i8> [[Y]], i64 12
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE9:%.*]] = extractelement <15 x i8> [[Y]], i64 13
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE11:%.*]] = extractelement <15 x i8> [[Y]], i64 14
 ; OPT-NEXT:    br label [[FINALLY]]
 ; OPT:       finally:
-; OPT-NEXT:    [[TMP3:%.*]] = phi <4 x i8> [ <i8 poison, i8 1, i8 2, i8 3>, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP4:%.*]] = phi <4 x i8> [ <i8 4, i8 undef, i8 6, i8 7>, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP5:%.*]] = phi <4 x i8> [ <i8 9, i8 10, i8 11, i8 12>, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP6:%.*]] = phi i8 [ 13, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP7:%.*]] = phi i8 [ 14, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP8:%.*]] = phi i8 [ undef, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ]
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE0:%.*]] = call <15 x i8> @llvm.vector.insert.v15i8.v4i8(<15 x i8> poison, <4 x i8> [[TMP3]], i64 0)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE1:%.*]] = call <15 x i8> @llvm.vector.insert.v15i8.v4i8(<15 x i8> [[LARGEPHI_INSERTSLICE0]], <4 x i8> [[TMP4]], i64 4)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE2:%.*]] = call <15 x i8> @llvm.vector.insert.v15i8.v4i8(<15 x i8> [[LARGEPHI_INSERTSLICE1]], <4 x i8> [[TMP5]], i64 8)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE3:%.*]] = insertelement <15 x i8> [[LARGEPHI_INSERTSLICE2]], i8 [[TMP6]], i64 12
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE4:%.*]] = insertelement <15 x i8> [[LARGEPHI_INSERTSLICE3]], i8 [[TMP7]], i64 13
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE5:%.*]] = insertelement <15 x i8> [[LARGEPHI_INSERTSLICE4]], i8 [[TMP8]], i64 14
+; OPT-NEXT:    [[TMP0:%.*]] = phi <4 x i8> [ <i8 poison, i8 1, i8 2, i8 3>, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP1:%.*]] = phi <4 x i8> [ <i8 4, i8 undef, i8 6, i8 7>, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP2:%.*]] = phi <4 x i8> [ <i8 9, i8 10, i8 11, i8 12>, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP3:%.*]] = phi i8 [ 13, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP4:%.*]] = phi i8 [ 14, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP5:%.*]] = phi i8 [ undef, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ]
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE0:%.*]] = call <15 x i8> @llvm.vector.insert.v15i8.v4i8(<15 x i8> poison, <4 x i8> [[TMP0]], i64 0)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE1:%.*]] = call <15 x i8> @llvm.vector.insert.v15i8.v4i8(<15 x i8> [[LARGEPHI_INSERTSLICE0]], <4 x i8> [[TMP1]], i64 4)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE2:%.*]] = call <15 x i8> @llvm.vector.insert.v15i8.v4i8(<15 x i8> [[LARGEPHI_INSERTSLICE1]], <4 x i8> [[TMP2]], i64 8)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE3:%.*]] = insertelement <15 x i8> [[LARGEPHI_INSERTSLICE2]], i8 [[TMP3]], i64 12
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE4:%.*]] = insertelement <15 x i8> [[LARGEPHI_INSERTSLICE3]], i8 [[TMP4]], i64 13
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE5:%.*]] = insertelement <15 x i8> [[LARGEPHI_INSERTSLICE4]], i8 [[TMP5]], i64 14
 ; OPT-NEXT:    store <15 x i8> [[LARGEPHI_INSERTSLICE5]], ptr [[OUT:%.*]], align 1
 ; OPT-NEXT:    ret void
 ;
@@ -695,8 +664,6 @@ define amdgpu_kernel void @phi_v15i8_random_constant_init(<15 x i8> %in, ptr %ou
 ; NOOPT-NEXT:    br label [[FINALLY:%.*]]
 ; NOOPT:       else:
 ; NOOPT-NEXT:    [[Y:%.*]] = insertelement <15 x i8> [[IN:%.*]], i8 64, i32 6
-; NOOPT-NEXT:    [[TMP0:%.*]] = shufflevector <15 x i8> [[Y]], <15 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; NOOPT-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
 ; NOOPT-NEXT:    br label [[FINALLY]]
 ; NOOPT:       finally:
 ; NOOPT-NEXT:    [[VAL:%.*]] = phi <15 x i8> [ <i8 poison, i8 1, i8 2, i8 3, i8 4, i8 undef, i8 6, i8 7, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 undef>, [[THEN]] ], [ [[Y]], [[ELSE]] ]
@@ -1020,8 +987,8 @@ define amdgpu_kernel void @phi_v7i16_switch(<7 x i16> %in, ptr %out, i8 %cond) {
 ; OPT-NEXT:  entry:
 ; OPT-NEXT:    [[X:%.*]] = insertelement <7 x i16> [[IN:%.*]], i16 3, i32 3
 ; OPT-NEXT:    switch i8 [[COND:%.*]], label [[ELSE:%.*]] [
-; OPT-NEXT:    i8 0, label [[THEN_1:%.*]]
-; OPT-NEXT:    i8 3, label [[THEN_2:%.*]]
+; OPT-NEXT:      i8 0, label [[THEN_1:%.*]]
+; OPT-NEXT:      i8 3, label [[THEN_2:%.*]]
 ; OPT-NEXT:    ]
 ; OPT:       then.1:
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE0:%.*]] = shufflevector <7 x i16> [[X]], <7 x i16> poison, <2 x i32> <i32 0, i32 1>
@@ -1058,8 +1025,8 @@ define amdgpu_kernel void @phi_v7i16_switch(<7 x i16> %in, ptr %out, i8 %cond) {
 ; NOOPT-NEXT:  entry:
 ; NOOPT-NEXT:    [[X:%.*]] = insertelement <7 x i16> [[IN:%.*]], i16 3, i32 3
 ; NOOPT-NEXT:    switch i8 [[COND:%.*]], label [[ELSE:%.*]] [
-; NOOPT-NEXT:    i8 0, label [[THEN_1:%.*]]
-; NOOPT-NEXT:    i8 3, label [[THEN_2:%.*]]
+; NOOPT-NEXT:      i8 0, label [[THEN_1:%.*]]
+; NOOPT-NEXT:      i8 3, label [[THEN_2:%.*]]
 ; NOOPT-NEXT:    ]
 ; NOOPT:       then.1:
 ; NOOPT-NEXT:    br label [[FINALLY:%.*]]
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 0ff5dd3680dfab..29f9e3bf94d05e 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -251,13 +251,13 @@
 ; GCN-O1-NEXT:        Function Alias Analysis Results
 ; GCN-O1-NEXT:        Flatten the CFG
 ; GCN-O1-NEXT:        Dominator Tree Construction
-; GCN-O1-NEXT:        Cycle Info Analysis
-; GCN-O1-NEXT:        Uniformity Analysis
-; GCN-O1-NEXT:        AMDGPU IR late optimizations
 ; GCN-O1-NEXT:        Basic Alias Analysis (stateless AA impl)
 ; GCN-O1-NEXT:        Function Alias Analysis Results
 ; GCN-O1-NEXT:        Natural Loop Information
 ; GCN-O1-NEXT:        Code sinking
+; GCN-O1-NEXT:        Cycle Info Analysis
+; GCN-O1-NEXT:        Uniformity Analysis
+; GCN-O1-NEXT:        AMDGPU IR late optimizations
 ; GCN-O1-NEXT:        Post-Dominator Tree Construction
 ; GCN-O1-NEXT:        Unify divergent function exit nodes
 ; GCN-O1-NEXT:        Dominator Tree Construction
@@ -546,13 +546,13 @@
 ; GCN-O1-OPTS-NEXT:        Function Alias Analysis Results
 ; GCN-O1-OPTS-NEXT:        Flatten the CFG
 ; GCN-O1-OPTS-NEXT:        Dominator Tree Construction
-; GCN-O1-OPTS-NEXT:        Cycle Info Analysis
-; GCN-O1-OPTS-NEXT:        Uniformity Analysis
-; GCN-O1-OPTS-NEXT:        AMDGPU IR late optimizations
 ; GCN-O1-OPTS-NEXT:        Basic Alias Analysis (stateless AA impl)
 ; GCN-O1-OPTS-NEXT:        Function Alias Analysis Results
 ; GCN-O1-OPTS-NEXT:        Natural Loop Information
 ; GCN-O1-OPTS-NEXT:        Code sinking
+; GCN-O1-OPTS-NEXT:        Cycle Info Analysis
+; GCN-O1-OPTS-NEXT:        Uniformity Analysis
+; GCN-O1-OPTS-NEXT:        AMDGPU IR late optimizations
 ; GCN-O1-OPTS-NEXT:        Post-Dominator Tree Construction
 ; GCN-O1-OPTS-NEXT:        Unify divergent function exit nodes
 ; GCN-O1-OPTS-NEXT:        Dominator Tree Construction
@@ -853,13 +853,13 @@
 ; GCN-O2-NEXT:        Function Alias Analysis Results
 ; GCN-O2-NEXT:        Flatten the CFG
 ; GCN-O2-NEXT:        Dominator Tree Construction
-; GCN-O2-NEXT:        Cycle Info Analysis
-; GCN-O2-NEXT:        Uniformity Analysis
-; GCN-O2-NEXT:        AMDGPU IR late optimizations
 ; GCN-O2-NEXT:        Basic Alias Analysis (stateless AA impl)
 ; GCN-O2-NEXT:        Function Alias Analysis Results
 ; GCN-O2-NEXT:        Natural Loop Information
 ; GCN-O2-NEXT:        Code sinking
+; GCN-O2-NEXT:        Cycle Info Analysis
+; GCN-O2-NEXT:        Uniformity Analysis
+; GCN-O2-NEXT:        AMDGPU IR late optimizations
 ; GCN-O2-NEXT:        Post-Dominator Tree Construction
 ; GCN-O2-NEXT:        Unify divergent function exit nodes
 ; GCN-O2-NEXT:        Dominator Tree Construction
@@ -1174,13 +1174,13 @@
 ; GCN-O3-NEXT:        Function Alias Analysis Results
 ; GCN-O3-NEXT:        Flatten the CFG
 ; GCN-O3-NEXT:        Dominator Tree Construction
-; GCN-O3-NEXT:        Cycle Info Analysis
-; GCN-O3-NEXT:        Uniformity Analysis
-; GCN-O3-NEXT:        AMDGPU IR late optimizations
 ; GCN-O3-NEXT:        Basic Alias Analysis (stateless AA impl)
 ; GCN-O3-NEXT:        Function Alias Analysis Results
 ; GCN-O3-NEXT:        Natural Loop Information
 ; GCN-O3-NEXT:        Code sinking
+; GCN-O3-NEXT:        Cycle Info Analysis
+; GCN-O3-NEXT:        Uniformity Analysis
+; GCN-O3-NEXT:        AMDGPU IR late optimizations
 ; GCN-O3-NEXT:        Post-Dominator Tree Construction
 ; GCN-O3-NEXT:        Unify divergent function exit nodes
 ; GCN-O3-NEXT:        Dominator Tree Construction
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
index c9dbadcbd23157..cacdc8237d5f31 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -2103,10 +2103,7 @@ define void @crash_lshlrevb16_not_reg_op() {
 ; NOSDWA:       ; %bb.0: ; %bb0
 ; NOSDWA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; NOSDWA-NEXT:    s_mov_b64 s[4:5], 0
-; NOSDWA-NEXT:    v_mov_b32_e32 v0, 0xff
-; NOSDWA-NEXT:    v_and_b32_e32 v0, s4, v0
-; NOSDWA-NEXT:    v_lshlrev_b16_e64 v1, 8, 1
-; NOSDWA-NEXT:    v_or_b32_e32 v0, v0, v1
+; NOSDWA-NEXT:    v_mov_b32_e32 v0, 0x100
 ; NOSDWA-NEXT:    s_and_b64 vcc, exec, -1
 ; NOSDWA-NEXT:  .LBB22_1: ; %bb1
 ; NOSDWA-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -2126,9 +2123,7 @@ define void @crash_lshlrevb16_not_reg_op() {
 ; GFX89:       ; %bb.0: ; %bb0
 ; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_mov_b64 s[4:5], 0
-; GFX89-NEXT:    v_lshlrev_b16_e64 v0, 8, 1
-; GFX89-NEXT:    v_mov_b32_e32 v1, s4
-; GFX89-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX89-NEXT:    v_mov_b32_e32 v0, 0x100
 ; GFX89-NEXT:    s_and_b64 vcc, exec, -1
 ; GFX89-NEXT:  .LBB22_1: ; %bb1
 ; GFX89-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -2148,8 +2143,7 @@ define void @crash_lshlrevb16_not_reg_op() {
 ; GFX9:       ; %bb.0: ; %bb0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
-; GFX9-NEXT:    v_lshlrev_b16_e64 v0, 8, 1
-; GFX9-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x100
 ; GFX9-NEXT:    s_and_b64 vcc, exec, -1
 ; GFX9-NEXT:  .LBB22_1: ; %bb1
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -2168,18 +2162,16 @@ define void @crash_lshlrevb16_not_reg_op() {
 ; GFX10-LABEL: crash_lshlrevb16_not_reg_op:
 ; GFX10:       ; %bb.0: ; %bb0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b16 v0, 8, 1
-; GFX10-NEXT:    s_mov_b32 vcc_lo, exec_lo
-; GFX10-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    s_mov_b64 s[4:5], 0
+; GFX10-NEXT:    s_mov_b32 vcc_lo, exec_lo
 ; GFX10-NEXT:  .LBB22_1: ; %bb1
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_lshl_b32 s6, s4, 3
-; GFX10-NEXT:    v_mov_b32_e32 v1, s4
-; GFX10-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-NEXT:    v_lshrrev_b16 v3, s6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-NEXT:    v_lshrrev_b16 v2, s6, 0x100
 ; GFX10-NEXT:    s_mov_b64 s[4:5], 1
-; GFX10-NEXT:    flat_store_byte v[1:2], v3
+; GFX10-NEXT:    flat_store_byte v[0:1], v2
 ; GFX10-NEXT:    s_cbranch_vccnz .LBB22_1
 ; GFX10-NEXT:  ; %bb.2: ; %DummyReturnBlock
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-loop-carried.ll b/llvm/test/CodeGen/AMDGPU/vni8-loop-carried.ll
new file mode 100644
index 00000000000000..1020990edecac2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vni8-loop-carried.ll
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
+
+define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: v32i8_liveout:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 5, v0
+; GFX906-NEXT:    v_cmp_lt_u32_e32 vcc, 14, v0
+; GFX906-NEXT:    s_mov_b32 s4, 0x2000604
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    global_load_dword v1, v1, s[2:3]
+; GFX906-NEXT:    s_mov_b64 s[2:3], 0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_mov_b32_e32 v0, v1
+; GFX906-NEXT:  .LBB0_1: ; %bb.1
+; GFX906-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX906-NEXT:    s_and_b64 s[6:7], exec, vcc
+; GFX906-NEXT:    s_or_b64 s[2:3], s[6:7], s[2:3]
+; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX906-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX906-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX906-NEXT:  ; %bb.2: ; %bb.2.loopexit
+; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX906-NEXT:    v_mov_b32_e32 v1, 0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX906-NEXT:    s_endpgm
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  br label %bb.1
+
+bb.1:
+  %temp = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+  %vec2 = shufflevector <4 x i8> %vec1, <4 x i8> %temp, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %cmp = icmp ult i32 %idx, 15
+  br i1 %cmp, label %bb.1, label %bb.2
+  br label %bb.2
+
+bb.2:
+  store <4 x i8> %vec2, ptr addrspace(1) %dst, align 4
+  ret void
+}

>From 6d4aa3918047655a595d8da9e26b1942d45107d4 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 19 Apr 2024 09:51:25 -0700
Subject: [PATCH 03/11] replace auto

Change-Id: I1b461e3194a27e5e3c45500cae0ef5d4d6540d59
---
 .../AMDGPU/AMDGPULateCodeGenPrepare.cpp       | 58 +++++++++----------
 1 file changed, 29 insertions(+), 29 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index f19f145ae86064..927c5f1506ae82 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -185,7 +185,7 @@ bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
   // cases, vectors of illegal types will be scalarized and widened, with each
   // scalar living in its own physical register. The optimization converts the
   // vectors to equivalent vectors of legal type (which are convereted back
-  // before uses in subsequenmt blocks), to pack the bits into fewer physical
+  // before uses in subsequent blocks), to pack the bits into fewer physical
   // registers (used in CopyToReg/CopyFromReg pairs).
   LiveRegOptimizer LRO(Mod);
 
@@ -221,7 +221,7 @@ bool LiveRegOptimizer::replaceUses(Instruction &I) {
   FromLRC.setNewType(getCompatibleType(FromLRC.getLiveRegDef()));
   for (auto IUser = I.user_begin(); IUser != I.user_end(); IUser++) {
 
-    if (auto UserInst = dyn_cast<Instruction>(*IUser)) {
+    if (Instruction *UserInst = dyn_cast<Instruction>(*IUser)) {
       if (UserInst->getParent() != I.getParent() || isa<PHINode>(UserInst)) {
         LLVM_DEBUG(dbgs() << *UserInst << "\n\tUses "
                           << *FromLRC.getOriginalType()
@@ -233,9 +233,9 @@ bool LiveRegOptimizer::replaceUses(Instruction &I) {
         // only replace the PHI node once we have converted all the operands
         if (auto PhiInst = dyn_cast<PHINode>(UserInst)) {
           for (unsigned Idx = 0; Idx < PhiInst->getNumIncomingValues(); Idx++) {
-            auto IncVal = PhiInst->getIncomingValue(Idx);
+            Value *IncVal = PhiInst->getIncomingValue(Idx);
             if (&I == dyn_cast<Instruction>(IncVal)) {
-              auto IncBlock = PhiInst->getIncomingBlock(Idx);
+              BasicBlock *IncBlock = PhiInst->getIncomingBlock(Idx);
               auto PHIOps = find_if(
                   PHIUpdater,
                   [&UserInst](
@@ -322,17 +322,17 @@ bool LiveRegOptimizer::replacePHIs() {
 }
 
 Type *LiveRegOptimizer::getCompatibleType(Instruction *InstToConvert) {
-  auto OriginalType = InstToConvert->getType();
+  Type *OriginalType = InstToConvert->getType();
   assert(OriginalType->getScalarSizeInBits() <=
          ConvertToScalar->getScalarSizeInBits());
-  auto VTy = dyn_cast<VectorType>(OriginalType);
+  VectorType *VTy = dyn_cast<VectorType>(OriginalType);
   if (!VTy)
     return ConvertToScalar;
 
-  auto OriginalSize =
+  unsigned OriginalSize =
       VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue();
-  auto ConvertScalarSize = ConvertToScalar->getScalarSizeInBits();
-  auto ConvertEltCount =
+  unsigned ConvertScalarSize = ConvertToScalar->getScalarSizeInBits();
+  unsigned ConvertEltCount =
       (OriginalSize + ConvertScalarSize - 1) / ConvertScalarSize;
 
   if (OriginalSize <= ConvertScalarSize)
@@ -348,16 +348,16 @@ void LiveRegOptimizer::convertToOptType(LiveRegConversion &LR) {
     return;
   }
 
-  auto VTy = cast<VectorType>(LR.getOriginalType());
+  VectorType *VTy = cast<VectorType>(LR.getOriginalType());
 
-  auto NewTy = LR.getNewType();
+  Type *NewTy = LR.getNewType();
   assert(NewTy);
-  auto NewVTy = NewTy->isVectorTy() ? cast<VectorType>(NewTy) : nullptr;
+  VectorType *NewVTy = NewTy->isVectorTy() ? cast<VectorType>(NewTy) : nullptr;
 
-  auto V = static_cast<Value *>(LR.getLiveRegDef());
-  auto OriginalSize =
+  Value *V = static_cast<Value *>(LR.getLiveRegDef());
+  unsigned OriginalSize =
       VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue();
-  auto NewSize = NewTy->isVectorTy()
+  unsigned NewSize = NewTy->isVectorTy()
                      ? NewVTy->getScalarSizeInBits() *
                            NewVTy->getElementCount().getFixedValue()
                      : NewTy->getScalarSizeInBits();
@@ -375,7 +375,7 @@ void LiveRegOptimizer::convertToOptType(LiveRegConversion &LR) {
 
   // If there is a bitsize mismatch, we must use a wider vector
   assert(NewSize > OriginalSize);
-  auto ExpandedVecElementCount =
+  ElementCount ExpandedVecElementCount =
       llvm::ElementCount::getFixed(NewSize / VTy->getScalarSizeInBits());
 
   SmallVector<int, 8> ShuffleMask;
@@ -396,18 +396,18 @@ void LiveRegOptimizer::convertToOptType(LiveRegConversion &LR) {
 }
 
 void LiveRegOptimizer::convertFromOptType(LiveRegConversion &LRC) {
-  auto OTy = LRC.getOriginalType();
-  auto VTy =
+  Type *OTy = LRC.getOriginalType();
+  VectorType *VTy =
       OTy->isVectorTy() ? dyn_cast<VectorType>(LRC.getOriginalType()) : nullptr;
 
-  auto NewVTy = cast<VectorType>(LRC.getNewType());
+  VectorType *NewVTy = cast<VectorType>(LRC.getNewType());
 
-  auto V = static_cast<Value *>(LRC.getLiveRegDef());
-  auto OriginalSize =
+  Value *V = static_cast<Value *>(LRC.getLiveRegDef());
+  unsigned OriginalSize =
       OTy->isVectorTy()
           ? VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue()
           : OTy->getScalarSizeInBits();
-  auto NewSize =
+  unsigned NewSize =
       NewVTy->getScalarSizeInBits() * NewVTy->getElementCount().getFixedValue();
 
   auto &Builder = LRC.getConvertBuilder();
@@ -433,20 +433,20 @@ void LiveRegOptimizer::convertFromOptType(LiveRegConversion &LRC) {
   // If there is a bitsize mismatch, we have used a wider vector and must strip
   // the MSBs to convert back to the original type
   assert(OriginalSize > NewSize);
-  auto ExpandedVecElementCount = llvm::ElementCount::getFixed(
+  ElementCount ExpandedVecElementCount = llvm::ElementCount::getFixed(
       OriginalSize / NewVTy->getScalarSizeInBits());
-  auto ExpandedVT = VectorType::get(
+  VectorType *ExpandedVT = VectorType::get(
       Type::getIntNTy(Mod->getContext(), NewVTy->getScalarSizeInBits()),
       ExpandedVecElementCount);
-  auto Converted = dyn_cast<Instruction>(
+  Instruction *Converted = dyn_cast<Instruction>(
       Builder.CreateBitCast(LRC.getLiveRegDef(), ExpandedVT));
 
-  auto NarrowElementCount = NewVTy->getElementCount().getFixedValue();
+  unsigned NarrowElementCount = NewVTy->getElementCount().getFixedValue();
   SmallVector<int, 8> ShuffleMask;
   for (uint64_t I = 0; I < NarrowElementCount; I++)
     ShuffleMask.push_back(I);
 
-  auto NarrowVec = dyn_cast<Instruction>(
+  Instruction *NarrowVec = dyn_cast<Instruction>(
       Builder.CreateShuffleVector(Converted, ShuffleMask));
   LRC.setConverted(dyn_cast<Instruction>(NarrowVec));
   LLVM_DEBUG(dbgs() << "\tProduced for user: " << **LRC.getConverted() << "\n");
@@ -457,7 +457,7 @@ bool LiveRegOptimizer::shouldReplaceUses(const Instruction &I) {
   // Vectors of illegal types are copied across blocks in an efficient manner.
   // They are scalarized and widened to legal scalars. In such cases, we can do
   // better by using legal vector types
-  auto IType = I.getType();
+  Type *IType = I.getType();
   return IType->isVectorTy() && IType->getScalarSizeInBits() < 16 &&
          !I.getType()->getScalarType()->isPointerTy();
 }
@@ -471,7 +471,7 @@ bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
   // Skip non-simple loads.
   if (!LI.isSimple())
     return false;
-  auto *Ty = LI.getType();
+  Type *Ty = LI.getType();
   // Skip aggregate types.
   if (Ty->isAggregateType())
     return false;

>From 85493c0a1df5d90baef912062cd67cfd5a19dc52 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 19 Apr 2024 10:15:38 -0700
Subject: [PATCH 04/11] Delete std::optional usage

Change-Id: Ia56d86e1acf191d19f6fc43ae780de9bb5118ba9
---
 .../AMDGPU/AMDGPULateCodeGenPrepare.cpp       | 28 +++++++++----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 927c5f1506ae82..070fefc31132c9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -92,7 +92,7 @@ class LiveRegConversion {
   Type *NewType;
   // The instruction sequence that converts the virtual register, to be used
   // instead of the original
-  std::optional<Instruction *> Converted;
+  Instruction *Converted = nullptr;
   // The builder used to build the conversion instruction
   IRBuilder<> ConvertBuilder;
 
@@ -107,13 +107,13 @@ class LiveRegConversion {
   void setNewType(Type *NewType) { this->NewType = NewType; }
   // The instruction that conerts the virtual register, to be used instead of
   // the original
-  std::optional<Instruction *> &getConverted() { return Converted; }
+  Instruction *getConverted() { return Converted; }
   void setConverted(Instruction *Converted) { this->Converted = Converted; }
   // The builder used to build the conversion instruction
   IRBuilder<> &getConvertBuilder() { return ConvertBuilder; }
   // Do we have a instruction sequence which convert the original virtual
   // register
-  bool hasConverted() { return Converted.has_value(); }
+  bool hasConverted() { return Converted != nullptr; }
 
   LiveRegConversion(Instruction *LiveRegDef, BasicBlock *InsertBlock,
                     BasicBlock::iterator InsertPt)
@@ -246,9 +246,9 @@ bool LiveRegOptimizer::replaceUses(Instruction &I) {
 
               if (PHIOps == PHIUpdater.end())
                 PHIUpdater.push_back(
-                    {UserInst, {{*FromLRC.getConverted(), IncBlock}}});
+                    {UserInst, {{FromLRC.getConverted(), IncBlock}}});
               else
-                PHIOps->second.push_back({*FromLRC.getConverted(), IncBlock});
+                PHIOps->second.push_back({FromLRC.getConverted(), IncBlock});
 
               break;
             }
@@ -264,13 +264,13 @@ bool LiveRegOptimizer::replaceUses(Instruction &I) {
           continue;
         }
 
-        LiveRegConversion ToLRC(*FromLRC.getConverted(), I.getType(),
+        LiveRegConversion ToLRC(FromLRC.getConverted(), I.getType(),
                                 UserInst->getParent(),
                                 static_cast<BasicBlock::iterator>(
                                     UserInst->getParent()->getFirstNonPHIIt()));
         convertFromOptType(ToLRC);
         assert(ToLRC.hasConverted());
-        UseConvertTracker[UserInst->getParent()] = {*ToLRC.getConverted(),
+        UseConvertTracker[UserInst->getParent()] = {ToLRC.getConverted(),
                                                     {UserInst}};
       }
     }
@@ -312,7 +312,7 @@ bool LiveRegOptimizer::replacePHIs() {
                                   ThePHINode->getParent()->getFirstNonPHIIt()));
       convertFromOptType(ToLRC);
       assert(ToLRC.hasConverted());
-      Ele.first->replaceAllUsesWith(*ToLRC.getConverted());
+      Ele.first->replaceAllUsesWith(ToLRC.getConverted());
       // The old PHI is no longer used
       ThePHINode->eraseFromParent();
       MadeChange = true;
@@ -368,8 +368,8 @@ void LiveRegOptimizer::convertToOptType(LiveRegConversion &LR) {
   // desired type
   if (OriginalSize == NewSize) {
     LR.setConverted(dyn_cast<Instruction>(Builder.CreateBitCast(V, NewTy)));
-    LLVM_DEBUG(dbgs() << "\tConverted def to "
-                      << *(*LR.getConverted())->getType() << "\n");
+    LLVM_DEBUG(dbgs() << "\tConverted def to " << *LR.getConverted()->getType()
+                      << "\n");
     return;
   }
 
@@ -390,7 +390,7 @@ void LiveRegOptimizer::convertToOptType(LiveRegConversion &LR) {
       dyn_cast<Instruction>(Builder.CreateShuffleVector(V, ShuffleMask));
   LR.setConverted(
       dyn_cast<Instruction>(Builder.CreateBitCast(ExpandedVec, NewTy)));
-  LLVM_DEBUG(dbgs() << "\tConverted def to " << *(*LR.getConverted())->getType()
+  LLVM_DEBUG(dbgs() << "\tConverted def to " << *LR.getConverted()->getType()
                     << "\n");
   return;
 }
@@ -415,7 +415,7 @@ void LiveRegOptimizer::convertFromOptType(LiveRegConversion &LRC) {
   // If there is a bitsize match, we simply convert back to the original type
   if (OriginalSize == NewSize) {
     LRC.setConverted(dyn_cast<Instruction>(Builder.CreateBitCast(V, NewVTy)));
-    LLVM_DEBUG(dbgs() << "\tProduced for user: " << **LRC.getConverted()
+    LLVM_DEBUG(dbgs() << "\tProduced for user: " << *LRC.getConverted()
                       << "\n");
     return;
   }
@@ -425,7 +425,7 @@ void LiveRegOptimizer::convertFromOptType(LiveRegConversion &LRC) {
         LRC.getLiveRegDef(), IntegerType::get(Mod->getContext(), NewSize)));
     auto Original = dyn_cast<Instruction>(Builder.CreateBitCast(Trunc, NewVTy));
     LRC.setConverted(dyn_cast<Instruction>(Original));
-    LLVM_DEBUG(dbgs() << "\tProduced for user: " << **LRC.getConverted()
+    LLVM_DEBUG(dbgs() << "\tProduced for user: " << *LRC.getConverted()
                       << "\n");
     return;
   }
@@ -449,7 +449,7 @@ void LiveRegOptimizer::convertFromOptType(LiveRegConversion &LRC) {
   Instruction *NarrowVec = dyn_cast<Instruction>(
       Builder.CreateShuffleVector(Converted, ShuffleMask));
   LRC.setConverted(dyn_cast<Instruction>(NarrowVec));
-  LLVM_DEBUG(dbgs() << "\tProduced for user: " << **LRC.getConverted() << "\n");
+  LLVM_DEBUG(dbgs() << "\tProduced for user: " << *LRC.getConverted() << "\n");
   return;
 }
 

>From 647885fabec1ae5d6d552196fd277b642ebe0bae Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 19 Apr 2024 10:27:06 -0700
Subject: [PATCH 05/11] query size instead of calculation

Change-Id: I8eeacb7d4292a215bb0540e8e7dd12ab7547d058
---
 .../AMDGPU/AMDGPULateCodeGenPrepare.cpp       | 31 +++++--------------
 1 file changed, 7 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 070fefc31132c9..5276598efeb6f8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -329,8 +329,7 @@ Type *LiveRegOptimizer::getCompatibleType(Instruction *InstToConvert) {
   if (!VTy)
     return ConvertToScalar;
 
-  unsigned OriginalSize =
-      VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue();
+  unsigned OriginalSize = VTy->getPrimitiveSizeInBits();
   unsigned ConvertScalarSize = ConvertToScalar->getScalarSizeInBits();
   unsigned ConvertEltCount =
       (OriginalSize + ConvertScalarSize - 1) / ConvertScalarSize;
@@ -349,21 +348,13 @@ void LiveRegOptimizer::convertToOptType(LiveRegConversion &LR) {
   }
 
   VectorType *VTy = cast<VectorType>(LR.getOriginalType());
-
   Type *NewTy = LR.getNewType();
-  assert(NewTy);
-  VectorType *NewVTy = NewTy->isVectorTy() ? cast<VectorType>(NewTy) : nullptr;
 
-  Value *V = static_cast<Value *>(LR.getLiveRegDef());
-  unsigned OriginalSize =
-      VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue();
-  unsigned NewSize = NewTy->isVectorTy()
-                     ? NewVTy->getScalarSizeInBits() *
-                           NewVTy->getElementCount().getFixedValue()
-                     : NewTy->getScalarSizeInBits();
+  unsigned OriginalSize = VTy->getPrimitiveSizeInBits();
+  unsigned NewSize = NewTy->getPrimitiveSizeInBits();
 
   auto &Builder = LR.getConvertBuilder();
-
+  Value *V = static_cast<Value *>(LR.getLiveRegDef());
   // If there is a bitsize match, we can fit the old vector into a new vector of
   // desired type
   if (OriginalSize == NewSize) {
@@ -397,21 +388,13 @@ void LiveRegOptimizer::convertToOptType(LiveRegConversion &LR) {
 
 void LiveRegOptimizer::convertFromOptType(LiveRegConversion &LRC) {
   Type *OTy = LRC.getOriginalType();
-  VectorType *VTy =
-      OTy->isVectorTy() ? dyn_cast<VectorType>(LRC.getOriginalType()) : nullptr;
-
   VectorType *NewVTy = cast<VectorType>(LRC.getNewType());
 
-  Value *V = static_cast<Value *>(LRC.getLiveRegDef());
-  unsigned OriginalSize =
-      OTy->isVectorTy()
-          ? VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue()
-          : OTy->getScalarSizeInBits();
-  unsigned NewSize =
-      NewVTy->getScalarSizeInBits() * NewVTy->getElementCount().getFixedValue();
+  unsigned OriginalSize = OTy->getPrimitiveSizeInBits();
+  unsigned NewSize = NewVTy->getPrimitiveSizeInBits();
 
   auto &Builder = LRC.getConvertBuilder();
-
+  Value *V = static_cast<Value *>(LRC.getLiveRegDef());
   // If there is a bitsize match, we simply convert back to the original type
   if (OriginalSize == NewSize) {
     LRC.setConverted(dyn_cast<Instruction>(Builder.CreateBitCast(V, NewVTy)));

>From 95ee7a5fe04956a91daccf2d1a74a513a4273eb7 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 19 Apr 2024 10:36:59 -0700
Subject: [PATCH 06/11] rename LiveRegConversion

Change-Id: I94504f26819c45de7496b39fee8031bcda0f29fb
---
 .../AMDGPU/AMDGPULateCodeGenPrepare.cpp       | 72 ++++++++++---------
 1 file changed, 37 insertions(+), 35 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 5276598efeb6f8..31eedcfe6dee9e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -81,7 +81,7 @@ class AMDGPULateCodeGenPrepare
   bool visitLoadInst(LoadInst &LI);
 };
 
-class LiveRegConversion {
+class ConversionCandidateInfo {
 private:
   // The instruction which defined the original virtual register used across
   // blocks
@@ -115,12 +115,13 @@ class LiveRegConversion {
   // register
   bool hasConverted() { return Converted != nullptr; }
 
-  LiveRegConversion(Instruction *LiveRegDef, BasicBlock *InsertBlock,
-                    BasicBlock::iterator InsertPt)
+  ConversionCandidateInfo(Instruction *LiveRegDef, BasicBlock *InsertBlock,
+                          BasicBlock::iterator InsertPt)
       : LiveRegDef(LiveRegDef), OriginalType(LiveRegDef->getType()),
         ConvertBuilder(InsertBlock, InsertPt) {}
-  LiveRegConversion(Instruction *LiveRegDef, Type *NewType,
-                    BasicBlock *InsertBlock, BasicBlock::iterator InsertPt)
+  ConversionCandidateInfo(Instruction *LiveRegDef, Type *NewType,
+                          BasicBlock *InsertBlock,
+                          BasicBlock::iterator InsertPt)
       : LiveRegDef(LiveRegDef), OriginalType(LiveRegDef->getType()),
         NewType(NewType), ConvertBuilder(InsertBlock, InsertPt) {}
 };
@@ -140,10 +141,10 @@ class LiveRegOptimizer {
   // Should the def of the instruction be converted if it is live across blocks
   bool shouldReplaceUses(const Instruction &I);
   // Convert the virtual register to the compatible vector of legal type
-  void convertToOptType(LiveRegConversion &LR);
+  void convertToOptType(ConversionCandidateInfo &LR);
   // Convert the virtual register back to the original type, stripping away
   // the MSBs in cases where there was an imperfect fit (e.g. v2i32 -> v7i8)
-  void convertFromOptType(LiveRegConversion &LR);
+  void convertFromOptType(ConversionCandidateInfo &LR);
   // Get a vector of desired scalar type that is compatible with the original
   // vector. In cases where there is no bitsize equivalent using a legal vector
   // type, we pad the MSBs (e.g. v7i8 -> v2i32)
@@ -213,21 +214,21 @@ bool LiveRegOptimizer::replaceUses(Instruction &I) {
     Instruction *Converted;
     SmallVector<Instruction *, 4> Users;
   };
-  DenseMap<BasicBlock *, ConvertUseInfo> UseConvertTracker;
+  DenseMap<BasicBlock *, ConvertUseInfo> InsertedConversionMap;
 
-  LiveRegConversion FromLRC(
+  ConversionCandidateInfo FromCCI(
       &I, I.getParent(),
       static_cast<BasicBlock::iterator>(std::next(I.getIterator())));
-  FromLRC.setNewType(getCompatibleType(FromLRC.getLiveRegDef()));
+  FromCCI.setNewType(getCompatibleType(FromCCI.getLiveRegDef()));
   for (auto IUser = I.user_begin(); IUser != I.user_end(); IUser++) {
 
     if (Instruction *UserInst = dyn_cast<Instruction>(*IUser)) {
       if (UserInst->getParent() != I.getParent() || isa<PHINode>(UserInst)) {
         LLVM_DEBUG(dbgs() << *UserInst << "\n\tUses "
-                          << *FromLRC.getOriginalType()
+                          << *FromCCI.getOriginalType()
                           << " from previous block. Needs conversion\n");
-        convertToOptType(FromLRC);
-        if (!FromLRC.hasConverted())
+        convertToOptType(FromCCI);
+        if (!FromCCI.hasConverted())
           continue;
         // If it is a PHI node, just create and collect the new operand. We can
         // only replace the PHI node once we have converted all the operands
@@ -246,9 +247,9 @@ bool LiveRegOptimizer::replaceUses(Instruction &I) {
 
               if (PHIOps == PHIUpdater.end())
                 PHIUpdater.push_back(
-                    {UserInst, {{FromLRC.getConverted(), IncBlock}}});
+                    {UserInst, {{FromCCI.getConverted(), IncBlock}}});
               else
-                PHIOps->second.push_back({FromLRC.getConverted(), IncBlock});
+                PHIOps->second.push_back({FromCCI.getConverted(), IncBlock});
 
               break;
             }
@@ -258,27 +259,28 @@ bool LiveRegOptimizer::replaceUses(Instruction &I) {
 
         // Do not create multiple conversion sequences if there are multiple
         // uses in the same block
-        if (UseConvertTracker.contains(UserInst->getParent())) {
-          UseConvertTracker[UserInst->getParent()].Users.push_back(UserInst);
+        if (InsertedConversionMap.contains(UserInst->getParent())) {
+          InsertedConversionMap[UserInst->getParent()].Users.push_back(
+              UserInst);
           LLVM_DEBUG(dbgs() << "\tUser already has access to converted def\n");
           continue;
         }
 
-        LiveRegConversion ToLRC(FromLRC.getConverted(), I.getType(),
-                                UserInst->getParent(),
-                                static_cast<BasicBlock::iterator>(
-                                    UserInst->getParent()->getFirstNonPHIIt()));
-        convertFromOptType(ToLRC);
-        assert(ToLRC.hasConverted());
-        UseConvertTracker[UserInst->getParent()] = {ToLRC.getConverted(),
-                                                    {UserInst}};
+        ConversionCandidateInfo ToCCI(
+            FromCCI.getConverted(), I.getType(), UserInst->getParent(),
+            static_cast<BasicBlock::iterator>(
+                UserInst->getParent()->getFirstNonPHIIt()));
+        convertFromOptType(ToCCI);
+        assert(ToCCI.hasConverted());
+        InsertedConversionMap[UserInst->getParent()] = {ToCCI.getConverted(),
+                                                        {UserInst}};
       }
     }
   }
 
   // Replace uses of with in a separate loop that is not dependent upon the
   // state of the uses
-  for (auto &Entry : UseConvertTracker) {
+  for (auto &Entry : InsertedConversionMap) {
     for (auto &UserInst : Entry.second.Users) {
       LLVM_DEBUG(dbgs() << *UserInst
                         << "\n\tNow uses: " << *Entry.second.Converted << "\n");
@@ -306,13 +308,13 @@ bool LiveRegOptimizer::replacePHIs() {
                           << "  For: " << IncVals.second->getName() << "\n");
       }
       LLVM_DEBUG(dbgs() << "Sucessfully replaced with " << *NPHI << "\n");
-      LiveRegConversion ToLRC(NPHI, ThePHINode->getType(),
-                              ThePHINode->getParent(),
-                              static_cast<BasicBlock::iterator>(
-                                  ThePHINode->getParent()->getFirstNonPHIIt()));
-      convertFromOptType(ToLRC);
-      assert(ToLRC.hasConverted());
-      Ele.first->replaceAllUsesWith(ToLRC.getConverted());
+      ConversionCandidateInfo ToCCI(
+          NPHI, ThePHINode->getType(), ThePHINode->getParent(),
+          static_cast<BasicBlock::iterator>(
+              ThePHINode->getParent()->getFirstNonPHIIt()));
+      convertFromOptType(ToCCI);
+      assert(ToCCI.hasConverted());
+      Ele.first->replaceAllUsesWith(ToCCI.getConverted());
       // The old PHI is no longer used
       ThePHINode->eraseFromParent();
       MadeChange = true;
@@ -341,7 +343,7 @@ Type *LiveRegOptimizer::getCompatibleType(Instruction *InstToConvert) {
                          llvm::ElementCount::getFixed(ConvertEltCount));
 }
 
-void LiveRegOptimizer::convertToOptType(LiveRegConversion &LR) {
+void LiveRegOptimizer::convertToOptType(ConversionCandidateInfo &LR) {
   if (LR.hasConverted()) {
     LLVM_DEBUG(dbgs() << "\tAlready has converted def\n");
     return;
@@ -386,7 +388,7 @@ void LiveRegOptimizer::convertToOptType(LiveRegConversion &LR) {
   return;
 }
 
-void LiveRegOptimizer::convertFromOptType(LiveRegConversion &LRC) {
+void LiveRegOptimizer::convertFromOptType(ConversionCandidateInfo &LRC) {
   Type *OTy = LRC.getOriginalType();
   VectorType *NewVTy = cast<VectorType>(LRC.getNewType());
 

>From 7fe461c3f49902b638bc4fd01ccd7d0f97ff9f53 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 19 Apr 2024 10:46:58 -0700
Subject: [PATCH 07/11] simplify initialization of shufflemask vector

Change-Id: I4383004240dc0365de6e67b12dc9ea5b609826d2
---
 llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 31eedcfe6dee9e..d0e0977d7bb4e5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -427,9 +427,8 @@ void LiveRegOptimizer::convertFromOptType(ConversionCandidateInfo &LRC) {
       Builder.CreateBitCast(LRC.getLiveRegDef(), ExpandedVT));
 
   unsigned NarrowElementCount = NewVTy->getElementCount().getFixedValue();
-  SmallVector<int, 8> ShuffleMask;
-  for (uint64_t I = 0; I < NarrowElementCount; I++)
-    ShuffleMask.push_back(I);
+  SmallVector<int, 8> ShuffleMask(NarrowElementCount);
+  std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);
 
   Instruction *NarrowVec = dyn_cast<Instruction>(
       Builder.CreateShuffleVector(Converted, ShuffleMask));

>From 09882483cc3922a797bc43440f52898c19817f39 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 19 Apr 2024 10:56:36 -0700
Subject: [PATCH 08/11] precommit global-isel tests

Change-Id: I07bf0cf4537bd3b148dc4ee3b785b989f0aac8b0
---
 .../AMDGPU/GlobalISel/vni8-across-blocks.ll   | 2120 +++++++++++++++++
 .../AMDGPU/GlobalISel/vni8-loop-carried.ll    |   67 +
 2 files changed, 2187 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-loop-carried.ll

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
new file mode 100644
index 00000000000000..cbb8fede31efa7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
@@ -0,0 +1,2120 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
+
+define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: v3i8_liveout:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
+; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    global_load_dword v1, v4, s[4:5]
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX906-NEXT:    s_cbranch_execz .LBB0_2
+; GFX906-NEXT:  ; %bb.1: ; %bb.1
+; GFX906-NEXT:    global_load_dword v1, v4, s[6:7]
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX906-NEXT:  .LBB0_2: ; %bb.2
+; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX906-NEXT:    v_and_b32_e32 v0, 0xff, v3
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    s_mov_b32 s0, 0xffff
+; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX906-NEXT:    v_and_b32_sdwa v1, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX906-NEXT:    v_mov_b32_e32 v1, 0
+; GFX906-NEXT:    global_store_short v1, v0, s[2:3]
+; GFX906-NEXT:    global_store_byte_d16_hi v1, v0, s[2:3] offset:2
+; GFX906-NEXT:    s_endpgm
+entry: ; Loads both <3 x i8> inputs (24 bits each, not a multiple of 32) here, in a different block from the phi that consumes them.
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <3 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <3 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <3 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <3 x i8>, ptr addrspace(1) %gep2
+  %cmp = icmp ult i32 %idx, 15
+  br i1 %cmp, label %bb.1, label %bb.2
+bb.1: ; Empty apart from the branch; supplies the second incoming edge of the phi in bb.2.
+  br label %bb.2
+
+bb.2: ; Selects %vec1 (edge from entry) or %vec2 (edge from bb.1) and stores the chosen vector to %dst.
+  %tmp5 = phi <3 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+  store <3 x i8> %tmp5, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: v4i8_liveout:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 2, v0
+; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    global_load_dword v1, v5, s[4:5]
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
+; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX906-NEXT:    s_cbranch_execz .LBB1_2
+; GFX906-NEXT:  ; %bb.1: ; %bb.1
+; GFX906-NEXT:    global_load_dword v1, v5, s[6:7]
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
+; GFX906-NEXT:  .LBB1_2: ; %bb.2
+; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX906-NEXT:    v_mov_b32_e32 v5, 8
+; GFX906-NEXT:    v_mov_b32_e32 v0, 0xff
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v0, v1, v0, v2
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v3
+; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
+; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX906-NEXT:    v_mov_b32_e32 v1, 0
+; GFX906-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX906-NEXT:    s_endpgm
+entry: ; Loads both <4 x i8> inputs (exactly 32 bits each) here, in a different block from the phi that consumes them.
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+  %cmp = icmp ult i32 %idx, 15
+  br i1 %cmp, label %bb.1, label %bb.2
+bb.1: ; Empty apart from the branch; supplies the second incoming edge of the phi in bb.2.
+  br label %bb.2
+
+bb.2: ; Selects %vec1 (edge from entry) or %vec2 (edge from bb.1) and stores the chosen vector to %dst.
+  %tmp5 = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+  store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: v5i8_liveout:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
+; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v6, s[4:5]
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX906-NEXT:    s_cbranch_execz .LBB2_2
+; GFX906-NEXT:  ; %bb.1: ; %bb.1
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v6, s[6:7]
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX906-NEXT:  .LBB2_2: ; %bb.2
+; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX906-NEXT:    v_mov_b32_e32 v0, 0
+; GFX906-NEXT:    global_store_byte v0, v1, s[2:3]
+; GFX906-NEXT:    global_store_byte v0, v3, s[2:3] offset:1
+; GFX906-NEXT:    global_store_byte v0, v4, s[2:3] offset:2
+; GFX906-NEXT:    global_store_byte v0, v5, s[2:3] offset:3
+; GFX906-NEXT:    global_store_byte v0, v2, s[2:3] offset:4
+; GFX906-NEXT:    s_endpgm
+entry: ; Loads both <5 x i8> inputs (40 bits each, not a multiple of 32) here, in a different block from the phi that consumes them.
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <5 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <5 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <5 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <5 x i8>, ptr addrspace(1) %gep2
+  %cmp = icmp ult i32 %idx, 15
+  br i1 %cmp, label %bb.1, label %bb.2
+bb.1: ; Empty apart from the branch; supplies the second incoming edge of the phi in bb.2.
+  br label %bb.2
+
+bb.2: ; Selects %vec1 (edge from entry) or %vec2 (edge from bb.1) and stores the chosen vector to %dst.
+  %tmp5 = phi <5 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+  store <5 x i8> %tmp5, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: v8i8_liveout:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 3, v0
+; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v9, s[4:5]
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
+; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX906-NEXT:    s_cbranch_execz .LBB3_2
+; GFX906-NEXT:  ; %bb.1: ; %bb.1
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v9, s[6:7]
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
+; GFX906-NEXT:  .LBB3_2: ; %bb.2
+; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX906-NEXT:    v_mov_b32_e32 v10, 8
+; GFX906-NEXT:    v_mov_b32_e32 v9, 0xff
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v0, v1, v9, v0
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v4
+; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 24, v5
+; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v3
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v10, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v1, v2, v9, v1
+; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v7
+; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 24, v8
+; GFX906-NEXT:    v_or3_b32 v1, v1, v2, v3
+; GFX906-NEXT:    v_mov_b32_e32 v2, 0
+; GFX906-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX906-NEXT:    s_endpgm
+entry: ; Loads both <8 x i8> inputs (64 bits each) here, in a different block from the phi that consumes them.
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
+  %cmp = icmp ult i32 %idx, 15
+  br i1 %cmp, label %bb.1, label %bb.2
+bb.1: ; Empty apart from the branch; supplies the second incoming edge of the phi in bb.2.
+  br label %bb.2
+
+bb.2: ; Selects %vec1 (edge from entry) or %vec2 (edge from bb.1) and stores the chosen vector to %dst.
+  %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+  store <8 x i8> %tmp5, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: v16i8_liveout:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX906-NEXT:    v_lshlrev_b32_e32 v17, 4, v0
+; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    global_load_dwordx4 v[1:4], v17, s[4:5]
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 24, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 8, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 16, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 24, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 8, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 16, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 24, v4
+; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX906-NEXT:    s_cbranch_execz .LBB4_2
+; GFX906-NEXT:  ; %bb.1: ; %bb.1
+; GFX906-NEXT:    global_load_dwordx4 v[1:4], v17, s[6:7]
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 24, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 8, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 16, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 24, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 8, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 16, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 24, v4
+; GFX906-NEXT:  .LBB4_2: ; %bb.2
+; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX906-NEXT:    v_mov_b32_e32 v18, 8
+; GFX906-NEXT:    v_mov_b32_e32 v17, 0xff
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v18, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v0, v1, v17, v0
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v6
+; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 24, v7
+; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v5
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v18, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v1, v2, v17, v1
+; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v9
+; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 24, v10
+; GFX906-NEXT:    v_or3_b32 v1, v1, v2, v5
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v2, v3, v17, v2
+; GFX906-NEXT:    v_and_b32_e32 v3, 0xff, v12
+; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 24, v13
+; GFX906-NEXT:    v_or3_b32 v2, v2, v3, v5
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v18, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v3, v4, v17, v3
+; GFX906-NEXT:    v_and_b32_e32 v4, 0xff, v15
+; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 24, v16
+; GFX906-NEXT:    v_or3_b32 v3, v3, v4, v5
+; GFX906-NEXT:    v_mov_b32_e32 v4, 0
+; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX906-NEXT:    s_endpgm
+entry: ; Loads both <16 x i8> inputs (128 bits each) here, in a different block from the phi that consumes them.
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <16 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <16 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <16 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <16 x i8>, ptr addrspace(1) %gep2
+  %cmp = icmp ult i32 %idx, 15
+  br i1 %cmp, label %bb.1, label %bb.2
+bb.1: ; Empty apart from the branch; supplies the second incoming edge of the phi in bb.2.
+  br label %bb.2
+
+bb.2: ; Selects %vec1 (edge from entry) or %vec2 (edge from bb.1) and stores the chosen vector to %dst.
+  %tmp5 = phi <16 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+  store <16 x i8> %tmp5, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: v32i8_liveout:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX906-NEXT:    v_lshlrev_b32_e32 v32, 5, v0
+; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    global_load_dwordx4 v[5:8], v32, s[4:5]
+; GFX906-NEXT:    global_load_dwordx4 v[1:4], v32, s[4:5] offset:16
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
+; GFX906-NEXT:    v_lshrrev_b32_e32 v26, 16, v5
+; GFX906-NEXT:    v_lshrrev_b32_e32 v27, 24, v5
+; GFX906-NEXT:    v_lshrrev_b32_e32 v28, 8, v6
+; GFX906-NEXT:    v_lshrrev_b32_e32 v29, 16, v6
+; GFX906-NEXT:    v_lshrrev_b32_e32 v30, 24, v6
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v7
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
+; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 24, v7
+; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 8, v8
+; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 16, v8
+; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 24, v8
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 8, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 16, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 24, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v18, 16, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v19, 24, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v20, 8, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v21, 16, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v22, 24, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v23, 8, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v24, 16, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v25, 24, v4
+; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX906-NEXT:    s_cbranch_execz .LBB5_2
+; GFX906-NEXT:  ; %bb.1: ; %bb.1
+; GFX906-NEXT:    global_load_dwordx4 v[5:8], v32, s[6:7]
+; GFX906-NEXT:    global_load_dwordx4 v[1:4], v32, s[6:7] offset:16
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
+; GFX906-NEXT:    v_lshrrev_b32_e32 v26, 16, v5
+; GFX906-NEXT:    v_lshrrev_b32_e32 v27, 24, v5
+; GFX906-NEXT:    v_lshrrev_b32_e32 v28, 8, v6
+; GFX906-NEXT:    v_lshrrev_b32_e32 v29, 16, v6
+; GFX906-NEXT:    v_lshrrev_b32_e32 v30, 24, v6
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v7
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
+; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 24, v7
+; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 8, v8
+; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 16, v8
+; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 24, v8
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 8, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 16, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 24, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v18, 16, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v19, 24, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v20, 8, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v21, 16, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v22, 24, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v23, 8, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v24, 16, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v25, 24, v4
+; GFX906-NEXT:  .LBB5_2: ; %bb.2
+; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT:    v_mov_b32_e32 v32, 8
+; GFX906-NEXT:    v_mov_b32_e32 v33, 0xff
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v0, v7, v33, v0
+; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v9
+; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 24, v10
+; GFX906-NEXT:    v_or3_b32 v7, v0, v7, v9
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v32, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v0, v8, v33, v0
+; GFX906-NEXT:    v_and_b32_e32 v8, 0xff, v12
+; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 24, v13
+; GFX906-NEXT:    v_or3_b32 v8, v0, v8, v9
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v32, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v0, v1, v33, v0
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v15
+; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 24, v16
+; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v9
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v32, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v1, v2, v33, v1
+; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v18
+; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 24, v19
+; GFX906-NEXT:    v_or3_b32 v1, v1, v2, v9
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v32, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v2, v3, v33, v2
+; GFX906-NEXT:    v_and_b32_e32 v3, 0xff, v21
+; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 24, v22
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v28, v32, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_or3_b32 v2, v2, v3, v9
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v32, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_b32_e32 v26, 0xff, v26
+; GFX906-NEXT:    v_and_or_b32 v6, v6, v33, v28
+; GFX906-NEXT:    v_and_b32_e32 v28, 0xff, v29
+; GFX906-NEXT:    v_and_or_b32 v3, v4, v33, v3
+; GFX906-NEXT:    v_and_b32_e32 v4, 0xff, v24
+; GFX906-NEXT:    v_and_or_b32 v5, v5, v33, v31
+; GFX906-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX906-NEXT:    v_lshlrev_b32_e32 v27, 24, v27
+; GFX906-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; GFX906-NEXT:    v_lshlrev_b32_e32 v29, 24, v30
+; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 24, v25
+; GFX906-NEXT:    v_or3_b32 v5, v5, v26, v27
+; GFX906-NEXT:    v_or3_b32 v6, v6, v28, v29
+; GFX906-NEXT:    v_or3_b32 v3, v3, v4, v9
+; GFX906-NEXT:    v_mov_b32_e32 v4, 0
+; GFX906-NEXT:    global_store_dwordx4 v4, v[5:8], s[0:1]
+; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX906-NEXT:    s_endpgm
+entry: ; Loads both <32 x i8> inputs (256 bits each) here, in a different block from the phi that consumes them.
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <32 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <32 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <32 x i8>, ptr addrspace(1) %gep2
+  %cmp = icmp ult i32 %idx, 15
+  br i1 %cmp, label %bb.1, label %bb.2
+bb.1: ; Empty apart from the branch; supplies the second incoming edge of the phi in bb.2.
+  br label %bb.2
+
+bb.2: ; Selects %vec1 (edge from entry) or %vec2 (edge from bb.1) and stores the chosen vector to %dst.
+  %tmp5 = phi <32 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+  store <32 x i8> %tmp5, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: v256i8_liveout:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX906-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX906-NEXT:    s_mov_b32 s10, -1
+; GFX906-NEXT:    s_mov_b32 s11, 0xe00000
+; GFX906-NEXT:    s_add_u32 s8, s8, s3
+; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; GFX906-NEXT:    s_addc_u32 s9, s9, 0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    global_load_dwordx4 v[5:8], v4, s[4:5]
+; GFX906-NEXT:    global_load_dwordx4 v[9:12], v4, s[4:5] offset:16
+; GFX906-NEXT:    global_load_dwordx4 v[29:32], v4, s[4:5] offset:32
+; GFX906-NEXT:    global_load_dwordx4 v[13:16], v4, s[4:5] offset:48
+; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v5
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:780 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v5
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v6
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v6
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v7
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v7
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v7
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v8
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:776 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v8
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v8
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(13)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v9
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v9
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v9
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v10
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v10
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v10
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v11
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v11
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v11
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v12
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v12
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    buffer_store_dword v10, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v11, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v12, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v12
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v29
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v29
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v29
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v30
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v30
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v30
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v31
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v31
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v31
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v32
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v32
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v32
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v13
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v13
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v13
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v14
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v14
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v14
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v15
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v15
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v15
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v16
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v16
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v16
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[0:3], v4, s[4:5] offset:64
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    global_load_dwordx4 v[45:48], v4, s[4:5] offset:80
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v0
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v0
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v45
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v45
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v45
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v46
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v46
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v46
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v47
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v47
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v47
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v48
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v48
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v48
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[57:60], v4, s[4:5] offset:96
+; GFX906-NEXT:    global_load_dwordx4 v[41:44], v4, s[4:5] offset:112
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v57
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v57
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v57
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v58
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v58
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v58
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v59
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v59
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v59
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v60
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v60
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v60
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v41
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v41
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v41
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v42
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v42
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v42
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v43
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v43
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v43
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v44
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v44
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v44
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[53:56], v4, s[4:5] offset:128
+; GFX906-NEXT:    global_load_dwordx4 v[25:28], v4, s[4:5] offset:144
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v53
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v53
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v53
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v54
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v54
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v54
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v55
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v55
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v55
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v56
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v56
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v56
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v25
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v25
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v25
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v26
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v26
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v26
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v27
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v27
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v27
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v28
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v28
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v28
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[49:52], v4, s[4:5] offset:160
+; GFX906-NEXT:    global_load_dwordx4 v[21:24], v4, s[4:5] offset:176
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v49
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v49
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v49
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v50
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v50
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v50
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v51
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v51
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v51
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v52
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v52
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v52
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v21
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v21
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v21
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v22
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v22
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v22
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v23
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v23
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v23
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v24
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v24
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v24
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[33:36], v4, s[4:5] offset:192
+; GFX906-NEXT:    global_load_dwordx4 v[17:20], v4, s[4:5] offset:208
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v33
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v33
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v33
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v34
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v34
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v34
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v35
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v35
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v35
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v36
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v36
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v36
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v17
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v17
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v17
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v18
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v18
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v18
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v19
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v19
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v19
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v20
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v20
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v20
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[37:40], v4, s[4:5] offset:224
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    global_load_dwordx4 v[9:12], v4, s[4:5] offset:240
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v37
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v37
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v37
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v38
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v38
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v38
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v39
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v39
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v39
+; GFX906-NEXT:    s_waitcnt vmcnt(8)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 8, v9
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v40
+; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 8, v11
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v40
+; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 16, v11
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v40
+; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 24, v11
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v9
+; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 8, v12
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v63, 24, v9
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v10
+; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 16, v12
+; GFX906-NEXT:    buffer_store_dword v63, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v63, 8, v10
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v10
+; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    buffer_store_dword v10, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v11, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v12, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 16, v5
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v12
+; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX906-NEXT:    s_cbranch_execz .LBB6_2
+; GFX906-NEXT:  ; %bb.1: ; %bb.1
+; GFX906-NEXT:    global_load_dwordx4 v[5:8], v4, s[6:7]
+; GFX906-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7] offset:16
+; GFX906-NEXT:    global_load_dwordx4 v[29:32], v4, s[6:7] offset:32
+; GFX906-NEXT:    global_load_dwordx4 v[13:16], v4, s[6:7] offset:48
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v5
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:780 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v5
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v6
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v6
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v6
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v7
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v7
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:776 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v8
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(13)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v0
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v0
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    buffer_store_dword v1, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v2, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 16, v5
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v3
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v29
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v29
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v29
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v30
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v30
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v30
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v31
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v31
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v31
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v32
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v32
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v32
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v13
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v13
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v13
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v14
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v14
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v14
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v15
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v15
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v15
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v16
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v16
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v16
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7] offset:64
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    global_load_dwordx4 v[45:48], v4, s[6:7] offset:80
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v0
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v0
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v45
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v45
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v45
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v46
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v46
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v46
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v47
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v47
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v47
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v48
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v48
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v48
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[57:60], v4, s[6:7] offset:96
+; GFX906-NEXT:    global_load_dwordx4 v[41:44], v4, s[6:7] offset:112
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v57
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v57
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v57
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v58
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v58
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v58
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v59
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v59
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v59
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v60
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v60
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v60
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v41
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v41
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v41
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v42
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v42
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v42
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v43
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v43
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v43
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v44
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v44
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v44
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[53:56], v4, s[6:7] offset:128
+; GFX906-NEXT:    global_load_dwordx4 v[25:28], v4, s[6:7] offset:144
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v53
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v53
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v53
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v54
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v54
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v54
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v55
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v55
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v55
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v56
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v56
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v56
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v25
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v25
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v25
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v26
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v26
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v26
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v27
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v27
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v27
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v28
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v28
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v28
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[49:52], v4, s[6:7] offset:160
+; GFX906-NEXT:    global_load_dwordx4 v[21:24], v4, s[6:7] offset:176
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v49
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v49
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v49
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v50
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v50
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v50
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v51
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v51
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v51
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v52
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v52
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v52
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v21
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v21
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v21
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v22
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v22
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v22
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v23
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v23
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v23
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v24
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v24
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v24
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[33:36], v4, s[6:7] offset:192
+; GFX906-NEXT:    global_load_dwordx4 v[17:20], v4, s[6:7] offset:208
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v33
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v33
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v33
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v34
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v34
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v34
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v35
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v35
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v35
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v36
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v36
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v36
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v17
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v17
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v17
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v18
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v18
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v18
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v19
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v19
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v19
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v20
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v20
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v20
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[37:40], v4, s[6:7] offset:224
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    global_load_dwordx4 v[9:12], v4, s[6:7] offset:240
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 8, v37
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v37
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v37
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 8, v38
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v38
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v38
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 8, v39
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v39
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v39
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 8, v40
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v40
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v40
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 8, v9
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v9
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v9
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v10
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 8, v11
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v11
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v11
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 8, v12
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v12
+; GFX906-NEXT:    v_lshrrev_b32_e32 v63, 8, v10
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v10
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    buffer_store_dword v10, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v11, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v12, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v12
+; GFX906-NEXT:  .LBB6_2: ; %bb.2
+; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:784 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v63, off, s[8:11], 0 offset:788 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:792 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    buffer_store_dword v18, off, s[8:11], 0 offset:796 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v19, off, s[8:11], 0 offset:800 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v20, off, s[8:11], 0 offset:804 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:780 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_mov_b32_e32 v4, 8
+; GFX906-NEXT:    v_mov_b32_e32 v63, 0xff
+; GFX906-NEXT:    v_mov_b32_e32 v18, v16
+; GFX906-NEXT:    v_mov_b32_e32 v17, v15
+; GFX906-NEXT:    v_mov_b32_e32 v16, v14
+; GFX906-NEXT:    v_mov_b32_e32 v15, v13
+; GFX906-NEXT:    v_mov_b32_e32 v19, v9
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v61, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:768 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_or_b32 v5, v5, v63, v61
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v61, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:772 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_or_b32 v6, v6, v63, v61
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v61, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:776 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_or_b32 v7, v7, v63, v61
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v61, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:692 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v11, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v12, off, s[8:11], 0 offset:20 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v13, off, s[8:11], 0 offset:24 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v14, off, s[8:11], 0 offset:28 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:712 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_or_b32 v8, v8, v63, v61
+; GFX906-NEXT:    v_and_b32_e32 v61, 0xff, v62
+; GFX906-NEXT:    v_lshlrev_b32_e32 v61, 16, v61
+; GFX906-NEXT:    s_waitcnt vmcnt(5)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v62, 24, v10
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:716 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX906-NEXT:    v_or3_b32 v5, v5, v61, v62
+; GFX906-NEXT:    v_mov_b32_e32 v61, 0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX906-NEXT:    v_or3_b32 v6, v6, v9, v10
+; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:736 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:740 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX906-NEXT:    v_or3_b32 v7, v7, v9, v10
+; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:744 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:748 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX906-NEXT:    v_or3_b32 v8, v8, v9, v10
+; GFX906-NEXT:    global_store_dwordx4 v61, v[5:8], s[2:3]
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:696 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:700 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:564 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:568 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:704 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:708 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(5)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v5, v11, v63, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v6, v12, v63, v6
+; GFX906-NEXT:    buffer_load_dword v11, off, s[8:11], 0 offset:572 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v12, off, s[8:11], 0 offset:576 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(5)
+; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX906-NEXT:    v_or3_b32 v5, v5, v9, v10
+; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:588 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:592 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(5)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v7, v13, v63, v7
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX906-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
+; GFX906-NEXT:    v_or3_b32 v6, v6, v11, v12
+; GFX906-NEXT:    buffer_load_dword v11, off, s[8:11], 0 offset:668 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v8, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX906-NEXT:    v_or3_b32 v7, v7, v9, v10
+; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:596 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:600 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v12, off, s[8:11], 0 offset:672 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_or_b32 v8, v14, v63, v8
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX906-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX906-NEXT:    v_or3_b32 v8, v8, v9, v10
+; GFX906-NEXT:    global_store_dwordx4 v61, v[5:8], s[2:3] offset:16
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:752 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
+; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:760 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:636 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:640 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:756 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:764 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(5)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v5, v29, v63, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX906-NEXT:    v_or3_b32 v5, v5, v9, v10
+; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:676 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:680 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_or_b32 v7, v31, v63, v7
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v6, v30, v63, v6
+; GFX906-NEXT:    v_or3_b32 v6, v6, v11, v12
+; GFX906-NEXT:    buffer_load_dword v11, off, s[8:11], 0 offset:524 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX906-NEXT:    v_or3_b32 v7, v7, v9, v10
+; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:684 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:688 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v8, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v8, v32, v63, v8
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX906-NEXT:    v_or3_b32 v8, v8, v9, v10
+; GFX906-NEXT:    global_store_dwordx4 v61, v[5:8], s[2:3] offset:32
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:652 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:660 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:500 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:504 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v12, off, s[8:11], 0 offset:528 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:656 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:664 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX906-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX906-NEXT:    s_waitcnt vmcnt(6)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v5, v15, v63, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(5)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX906-NEXT:    v_or3_b32 v5, v5, v9, v10
+; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:548 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:552 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_or_b32 v7, v17, v63, v7
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v6, v16, v63, v6
+; GFX906-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
+; GFX906-NEXT:    v_or3_b32 v6, v6, v11, v12
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX906-NEXT:    v_or3_b32 v7, v7, v9, v10
+; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:556 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:560 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v8, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v8, v18, v63, v8
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX906-NEXT:    v_or3_b32 v8, v8, v9, v10
+; GFX906-NEXT:    global_store_dwordx4 v61, v[5:8], s[2:3] offset:48
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:720 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v0, v0, v63, v5
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:724 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:584 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:604 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:608 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v1, v1, v63, v5
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:728 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX906-NEXT:    v_or3_b32 v1, v1, v7, v8
+; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:476 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v2, v2, v63, v5
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:732 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v3, v3, v63, v5
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:580 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:628 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:632 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v2, v2, v5, v6
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:644 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:648 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
+; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:64
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:612 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:620 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:468 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:472 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:480 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:616 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:624 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX906-NEXT:    s_waitcnt vmcnt(6)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v0, v45, v63, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(5)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:508 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:512 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_or_b32 v2, v47, v63, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v1, v46, v63, v1
+; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX906-NEXT:    v_or3_b32 v1, v1, v7, v8
+; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:428 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v2, v2, v5, v6
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:516 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:520 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v3, v48, v63, v3
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
+; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:80
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:532 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:540 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:404 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:408 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:432 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:536 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:544 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX906-NEXT:    s_waitcnt vmcnt(6)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v0, v57, v63, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(5)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:452 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:456 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_or_b32 v2, v59, v63, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v1, v58, v63, v1
+; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX906-NEXT:    v_or3_b32 v1, v1, v7, v8
+; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:332 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v2, v2, v5, v6
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:460 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:464 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v3, v60, v63, v3
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
+; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:96
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:436 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:440 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:308 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:312 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:336 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:444 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:448 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX906-NEXT:    s_waitcnt vmcnt(6)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v0, v41, v63, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(5)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:356 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:360 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_or_b32 v1, v42, v63, v1
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX906-NEXT:    v_or3_b32 v1, v1, v7, v8
+; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:380 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v2, v43, v63, v2
+; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:384 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v3, v44, v63, v3
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v2, v2, v5, v6
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:364 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:368 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
+; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:112
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:484 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:492 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:372 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:376 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:488 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:496 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(5)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v0, v53, v63, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:412 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:416 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_or_b32 v2, v55, v63, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v1, v54, v63, v1
+; GFX906-NEXT:    v_or3_b32 v1, v1, v7, v8
+; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:284 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v2, v2, v5, v6
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:420 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:424 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v3, v56, v63, v3
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
+; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:128
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:396 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:276 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:280 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:288 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:392 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:400 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX906-NEXT:    s_waitcnt vmcnt(6)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v0, v25, v63, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(5)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:316 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:320 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_or_b32 v2, v27, v63, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v1, v26, v63, v1
+; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX906-NEXT:    v_or3_b32 v1, v1, v7, v8
+; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:236 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v2, v2, v5, v6
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:324 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:328 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v3, v28, v63, v3
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
+; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:144
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:340 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:348 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:164 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:168 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:240 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:344 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:352 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(6)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v0, v49, v63, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(5)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:260 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:264 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_or_b32 v2, v51, v63, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v1, v50, v63, v1
+; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v2, v2, v5, v6
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:268 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:272 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v3, v52, v63, v3
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:80 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX906-NEXT:    v_or3_b32 v1, v1, v7, v8
+; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:40 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:32 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:36 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:160
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:244 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:248 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:252 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:256 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v0, v21, v63, v0
+; GFX906-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:44 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:48 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v1, v22, v63, v1
+; GFX906-NEXT:    v_or3_b32 v1, v1, v7, v8
+; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:108 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v2, v23, v63, v2
+; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:112 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v3, v24, v63, v3
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v2, v2, v5, v6
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:52 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:56 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
+; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:176
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:292 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:300 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:60 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:64 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:296 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:304 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(5)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v0, v33, v63, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:172 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:176 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_or_b32 v2, v35, v63, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v1, v34, v63, v1
+; GFX906-NEXT:    v_or3_b32 v1, v1, v7, v8
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v2, v2, v5, v6
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:228 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:232 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v3, v36, v63, v3
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
+; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:192
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:116 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:792 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:796 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:800 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:804 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:120 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:124 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:136 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(7)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_and_or_b32 v0, v5, v63, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:128 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:132 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:148 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX906-NEXT:    v_and_or_b32 v1, v6, v63, v1
+; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX906-NEXT:    v_or3_b32 v1, v1, v2, v3
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:140 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:144 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:160 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX906-NEXT:    v_and_or_b32 v2, v7, v63, v2
+; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX906-NEXT:    v_or3_b32 v2, v2, v3, v5
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:152 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:156 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_and_or_b32 v3, v8, v63, v3
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
+; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:208
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:180 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:184 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:188 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:200 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:224 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:212 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(5)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v0, v37, v63, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:192 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:196 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX906-NEXT:    v_and_or_b32 v1, v38, v63, v1
+; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX906-NEXT:    v_or3_b32 v1, v1, v2, v3
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:204 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:208 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX906-NEXT:    v_and_or_b32 v2, v39, v63, v2
+; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX906-NEXT:    v_or3_b32 v2, v2, v3, v5
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:216 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:220 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_and_or_b32 v3, v40, v63, v3
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
+; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:224
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:68 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:88 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:784 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:72 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(7)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_and_or_b32 v0, v5, v63, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:788 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:92 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:76 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v1, v6, v63, v1
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX906-NEXT:    v_or3_b32 v1, v1, v2, v3
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:84 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:96 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX906-NEXT:    v_and_or_b32 v2, v7, v63, v2
+; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX906-NEXT:    v_or3_b32 v2, v2, v3, v5
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:104 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 24, v19
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    buffer_load_dword v4, off, s[8:11], 0 offset:100 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_or_b32 v3, v8, v63, v3
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX906-NEXT:    v_or3_b32 v3, v3, v4, v5
+; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:240
+; GFX906-NEXT:    s_endpgm
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <256 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <256 x i8>, ptr addrspace(1) %gep2
+  %cmp = icmp ult i32 %idx, 15
+  br i1 %cmp, label %bb.1, label %bb.2
+bb.1:
+  br label %bb.2
+
+bb.2:
+  %tmp5 = phi <256 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+  store <256 x i8> %tmp5, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-loop-carried.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-loop-carried.ll
new file mode 100644
index 00000000000000..95c541e2e60b7f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-loop-carried.ll
@@ -0,0 +1,67 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
+
+define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: v32i8_liveout:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 5, v0
+; GFX906-NEXT:    v_cmp_le_u32_e32 vcc, 15, v0
+; GFX906-NEXT:    v_mov_b32_e32 v3, 0xff
+; GFX906-NEXT:    v_mov_b32_e32 v4, 8
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    global_load_dword v1, v1, s[2:3]
+; GFX906-NEXT:    s_mov_b64 s[2:3], 0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v2, 24, v1
+; GFX906-NEXT:    v_mov_b32_e32 v5, v0
+; GFX906-NEXT:  .LBB0_1: ; %bb.1
+; GFX906-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX906-NEXT:    v_and_or_b32 v6, v1, v3, v6
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX906-NEXT:    s_and_b64 s[4:5], exec, vcc
+; GFX906-NEXT:    v_or3_b32 v5, v6, v5, v2
+; GFX906-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
+; GFX906-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
+; GFX906-NEXT:    v_mov_b32_e32 v6, v0
+; GFX906-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX906-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX906-NEXT:  ; %bb.2: ; %bb.2.loopexit
+; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX906-NEXT:    v_mov_b32_e32 v4, 8
+; GFX906-NEXT:    v_mov_b32_e32 v3, 0xff
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v0, v1, v3, v0
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v5
+; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX906-NEXT:    v_mov_b32_e32 v1, 0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX906-NEXT:    s_endpgm
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  br label %bb.1
+
+bb.1:
+  %temp = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+  %vec2 = shufflevector <4 x i8> %vec1, <4 x i8> %temp, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %cmp = icmp ult i32 %idx, 15
+  br i1 %cmp, label %bb.1, label %bb.2
+  br label %bb.2
+
+bb.2:
+  store <4 x i8> %vec2, ptr addrspace(1) %dst, align 4
+  ret void
+}

>From 95bd7877c4fa0cac3cb1407ab7329227f1511293 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 19 Apr 2024 10:58:25 -0700
Subject: [PATCH 09/11] Enable for GlobalISel

Change-Id: I83ae012da3118b0a40fb8a80be5029ce5bd2d78a
---
 .../AMDGPU/AMDGPULateCodeGenPrepare.cpp       |    4 -
 .../AMDGPU/GlobalISel/vni8-across-blocks.ll   | 2067 ++---------------
 .../AMDGPU/GlobalISel/vni8-loop-carried.ll    |   37 +-
 3 files changed, 198 insertions(+), 1910 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index d0e0977d7bb4e5..822b85fac5188f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -194,10 +194,6 @@ bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
   for (auto &BB : F)
     for (Instruction &I : llvm::make_early_inc_range(BB)) {
       Changed |= visit(I);
-      // GlobalISel should directly use the values, and do not need to emit
-      // CopyTo/CopyFrom Regs across blocks
-      if (TM.Options.EnableGlobalISel)
-        continue;
       if (!LRO.shouldReplaceUses(I))
         continue;
       Changed |= LRO.replaceUses(I);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
index cbb8fede31efa7..3def10e73717b3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
@@ -6,28 +6,36 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
-; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX906-NEXT:    v_mov_b32_e32 v3, 8
+; GFX906-NEXT:    v_mov_b32_e32 v5, 16
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dword v1, v4, s[4:5]
+; GFX906-NEXT:    global_load_dword v4, v2, s[4:5]
+; GFX906-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX906-NEXT:    v_and_b32_e32 v6, 0xff, v4
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v7, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX906-NEXT:    v_or3_b32 v4, v6, v7, v4
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB0_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dword v1, v4, s[6:7]
+; GFX906-NEXT:    global_load_dword v0, v2, s[6:7]
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v0
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX906-NEXT:    v_or3_b32 v4, v2, v3, v0
 ; GFX906-NEXT:  .LBB0_2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    v_and_b32_e32 v0, 0xff, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v4
+; GFX906-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_mov_b32 s0, 0xffff
+; GFX906-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX906-NEXT:    v_and_b32_sdwa v1, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX906-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX906-NEXT:    global_store_short v1, v0, s[2:3]
@@ -55,34 +63,19 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 2, v0
+; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dword v1, v5, s[4:5]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
+; GFX906-NEXT:    global_load_dword v1, v2, s[4:5]
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dword v1, v5, s[6:7]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
+; GFX906-NEXT:    global_load_dword v1, v2, s[6:7]
 ; GFX906-NEXT:  .LBB1_2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    v_mov_b32_e32 v5, 8
-; GFX906-NEXT:    v_mov_b32_e32 v0, 0xff
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v0, v1, v0, v2
-; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v3
-; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
-; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v2
-; GFX906-NEXT:    v_mov_b32_e32 v1, 0
-; GFX906-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX906-NEXT:    v_mov_b32_e32 v0, 0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -106,30 +99,28 @@ define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
+; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v6, s[4:5]
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[4:5]
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v6, s[6:7]
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[6:7]
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX906-NEXT:  .LBB2_2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    v_mov_b32_e32 v0, 0
-; GFX906-NEXT:    global_store_byte v0, v1, s[2:3]
-; GFX906-NEXT:    global_store_byte v0, v3, s[2:3] offset:1
-; GFX906-NEXT:    global_store_byte v0, v4, s[2:3] offset:2
-; GFX906-NEXT:    global_store_byte v0, v5, s[2:3] offset:3
-; GFX906-NEXT:    global_store_byte v0, v2, s[2:3] offset:4
+; GFX906-NEXT:    v_mov_b32_e32 v4, 0
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
+; GFX906-NEXT:    global_store_byte v4, v1, s[2:3]
+; GFX906-NEXT:    global_store_byte v4, v0, s[2:3] offset:1
+; GFX906-NEXT:    global_store_byte_d16_hi v4, v1, s[2:3] offset:2
+; GFX906-NEXT:    global_store_byte v4, v3, s[2:3] offset:3
+; GFX906-NEXT:    global_store_byte v4, v2, s[2:3] offset:4
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -153,46 +144,19 @@ define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 3, v0
+; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v9, s[4:5]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[4:5]
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v9, s[6:7]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[6:7]
 ; GFX906-NEXT:  .LBB3_2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    v_mov_b32_e32 v10, 8
-; GFX906-NEXT:    v_mov_b32_e32 v9, 0xff
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v0, v1, v9, v0
-; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v4
-; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 24, v5
-; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v3
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v10, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v1, v2, v9, v1
-; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v7
-; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 24, v8
-; GFX906-NEXT:    v_or3_b32 v1, v1, v2, v3
-; GFX906-NEXT:    v_mov_b32_e32 v2, 0
-; GFX906-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX906-NEXT:    v_mov_b32_e32 v0, 0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[2:3]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -216,70 +180,19 @@ define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v17, 4, v0
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 4, v0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx4 v[1:4], v17, s[4:5]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 8, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 16, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 24, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 8, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 16, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 24, v4
+; GFX906-NEXT:    global_load_dwordx4 v[1:4], v5, s[4:5]
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx4 v[1:4], v17, s[6:7]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 8, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 16, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 24, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 8, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 16, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 24, v4
+; GFX906-NEXT:    global_load_dwordx4 v[1:4], v5, s[6:7]
 ; GFX906-NEXT:  .LBB4_2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    v_mov_b32_e32 v18, 8
-; GFX906-NEXT:    v_mov_b32_e32 v17, 0xff
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v18, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v0, v1, v17, v0
-; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v6
-; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 24, v7
-; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v5
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v18, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v1, v2, v17, v1
-; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v9
-; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 24, v10
-; GFX906-NEXT:    v_or3_b32 v1, v1, v2, v5
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v2, v3, v17, v2
-; GFX906-NEXT:    v_and_b32_e32 v3, 0xff, v12
-; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 24, v13
-; GFX906-NEXT:    v_or3_b32 v2, v2, v3, v5
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v18, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v3, v4, v17, v3
-; GFX906-NEXT:    v_and_b32_e32 v4, 0xff, v15
-; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 24, v16
-; GFX906-NEXT:    v_or3_b32 v3, v3, v4, v5
-; GFX906-NEXT:    v_mov_b32_e32 v4, 0
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX906-NEXT:    v_mov_b32_e32 v0, 0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    global_store_dwordx4 v0, v[1:4], s[2:3]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -302,124 +215,24 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
 ; GFX906-LABEL: v32i8_liveout:
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX906-NEXT:    v_lshlrev_b32_e32 v32, 5, v0
-; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 5, v0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx4 v[5:8], v32, s[4:5]
-; GFX906-NEXT:    global_load_dwordx4 v[1:4], v32, s[4:5] offset:16
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v26, 16, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v27, 24, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v28, 8, v6
-; GFX906-NEXT:    v_lshrrev_b32_e32 v29, 16, v6
-; GFX906-NEXT:    v_lshrrev_b32_e32 v30, 24, v6
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v7
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
-; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 24, v7
-; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 8, v8
-; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 16, v8
-; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 24, v8
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 8, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 24, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v18, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v19, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v20, 8, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v21, 16, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v22, 24, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v23, 8, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v24, 16, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v25, 24, v4
-; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX906-NEXT:    global_load_dwordx4 v[1:4], v9, s[4:5]
+; GFX906-NEXT:    global_load_dwordx4 v[5:8], v9, s[4:5] offset:16
+; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx4 v[5:8], v32, s[6:7]
-; GFX906-NEXT:    global_load_dwordx4 v[1:4], v32, s[6:7] offset:16
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v26, 16, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v27, 24, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v28, 8, v6
-; GFX906-NEXT:    v_lshrrev_b32_e32 v29, 16, v6
-; GFX906-NEXT:    v_lshrrev_b32_e32 v30, 24, v6
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v7
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
-; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 24, v7
-; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 8, v8
-; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 16, v8
-; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 24, v8
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 8, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 24, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v18, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v19, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v20, 8, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v21, 16, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v22, 24, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v23, 8, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v24, 16, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v25, 24, v4
+; GFX906-NEXT:    global_load_dwordx4 v[1:4], v9, s[6:7]
+; GFX906-NEXT:    global_load_dwordx4 v[5:8], v9, s[6:7] offset:16
 ; GFX906-NEXT:  .LBB5_2: ; %bb.2
-; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX906-NEXT:    v_mov_b32_e32 v32, 8
-; GFX906-NEXT:    v_mov_b32_e32 v33, 0xff
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v0, v7, v33, v0
-; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v9
-; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 24, v10
-; GFX906-NEXT:    v_or3_b32 v7, v0, v7, v9
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v32, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v0, v8, v33, v0
-; GFX906-NEXT:    v_and_b32_e32 v8, 0xff, v12
-; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 24, v13
-; GFX906-NEXT:    v_or3_b32 v8, v0, v8, v9
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v32, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v0, v1, v33, v0
-; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v15
-; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 24, v16
-; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v9
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v32, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v1, v2, v33, v1
-; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v18
-; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 24, v19
-; GFX906-NEXT:    v_or3_b32 v1, v1, v2, v9
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v32, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v2, v3, v33, v2
-; GFX906-NEXT:    v_and_b32_e32 v3, 0xff, v21
-; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 24, v22
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v28, v32, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_or3_b32 v2, v2, v3, v9
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v32, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_b32_e32 v26, 0xff, v26
-; GFX906-NEXT:    v_and_or_b32 v6, v6, v33, v28
-; GFX906-NEXT:    v_and_b32_e32 v28, 0xff, v29
-; GFX906-NEXT:    v_and_or_b32 v3, v4, v33, v3
-; GFX906-NEXT:    v_and_b32_e32 v4, 0xff, v24
-; GFX906-NEXT:    v_and_or_b32 v5, v5, v33, v31
-; GFX906-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; GFX906-NEXT:    v_lshlrev_b32_e32 v27, 24, v27
-; GFX906-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
-; GFX906-NEXT:    v_lshlrev_b32_e32 v29, 24, v30
-; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 24, v25
-; GFX906-NEXT:    v_or3_b32 v5, v5, v26, v27
-; GFX906-NEXT:    v_or3_b32 v6, v6, v28, v29
-; GFX906-NEXT:    v_or3_b32 v3, v3, v4, v9
-; GFX906-NEXT:    v_mov_b32_e32 v4, 0
-; GFX906-NEXT:    global_store_dwordx4 v4, v[5:8], s[0:1]
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX906-NEXT:    v_mov_b32_e32 v0, 0
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    global_store_dwordx4 v0, v[1:4], s[2:3]
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    global_store_dwordx4 v0, v[5:8], s[2:3] offset:16
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -450,1654 +263,148 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX906-NEXT:    s_addc_u32 s9, s9, 0
+; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    global_load_dwordx4 v[5:8], v4, s[4:5]
-; GFX906-NEXT:    global_load_dwordx4 v[9:12], v4, s[4:5] offset:16
-; GFX906-NEXT:    global_load_dwordx4 v[29:32], v4, s[4:5] offset:32
-; GFX906-NEXT:    global_load_dwordx4 v[13:16], v4, s[4:5] offset:48
-; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v5
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:780 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v5
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v6
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v6
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v7
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v7
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v7
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v8
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:776 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v8
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v8
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(13)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v9
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v9
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v9
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v10
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v10
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v10
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v11
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v11
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v11
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v12
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v12
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    buffer_store_dword v10, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v11, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v12, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v12
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v29
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v29
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v29
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v30
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v30
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v30
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v31
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v31
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v31
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v32
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v32
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v32
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v13
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v13
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v13
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v14
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v14
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v14
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v15
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v15
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v15
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v16
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v16
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v16
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[0:3], v4, s[4:5] offset:64
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    global_load_dwordx4 v[45:48], v4, s[4:5] offset:80
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v0
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v0
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v45
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v45
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v45
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v46
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v46
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v46
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v47
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v47
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v47
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v48
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v48
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v48
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[57:60], v4, s[4:5] offset:96
-; GFX906-NEXT:    global_load_dwordx4 v[41:44], v4, s[4:5] offset:112
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v57
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v57
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v57
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v58
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v58
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v58
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v59
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v59
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v59
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v60
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v60
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v60
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v41
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v41
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v41
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v42
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v42
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v42
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v43
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v43
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v43
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v44
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v44
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v44
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[53:56], v4, s[4:5] offset:128
-; GFX906-NEXT:    global_load_dwordx4 v[25:28], v4, s[4:5] offset:144
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v53
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v53
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v53
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v54
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v54
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v54
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v55
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v55
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v55
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v56
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v56
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v56
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v25
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v25
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v25
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v26
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v26
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v26
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v27
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v27
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v27
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v28
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v28
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v28
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[49:52], v4, s[4:5] offset:160
-; GFX906-NEXT:    global_load_dwordx4 v[21:24], v4, s[4:5] offset:176
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v49
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v49
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v49
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v50
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v50
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v50
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v51
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v51
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v51
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v52
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v52
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v52
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v21
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v21
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v21
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v22
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v22
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v22
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v23
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v23
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v23
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v24
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v24
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v24
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[33:36], v4, s[4:5] offset:192
-; GFX906-NEXT:    global_load_dwordx4 v[17:20], v4, s[4:5] offset:208
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v33
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v33
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v33
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v34
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v34
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v34
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v35
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v35
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v35
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v36
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v36
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v36
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v17
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v17
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v17
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v18
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v18
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v18
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v19
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v19
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v19
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v20
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v20
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v20
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[37:40], v4, s[4:5] offset:224
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    global_load_dwordx4 v[9:12], v4, s[4:5] offset:240
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v37
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v37
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v37
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v38
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v38
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v38
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v39
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v39
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v39
-; GFX906-NEXT:    s_waitcnt vmcnt(8)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 8, v9
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v40
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 8, v11
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v40
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 16, v11
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v40
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 24, v11
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v9
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 8, v12
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v63, 24, v9
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v10
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 16, v12
-; GFX906-NEXT:    buffer_store_dword v63, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v63, 8, v10
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v10
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v5, off, s[8:11], 0 ; 4-byte Folded Spill
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    buffer_store_dword v10, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v11, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v12, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 16, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v12
+; GFX906-NEXT:    buffer_store_dword v6, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v7, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v8, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[5:8], v4, s[4:5] offset:16
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    global_load_dwordx4 v[9:12], v4, s[4:5] offset:32
+; GFX906-NEXT:    global_load_dwordx4 v[13:16], v4, s[4:5] offset:48
+; GFX906-NEXT:    global_load_dwordx4 v[17:20], v4, s[4:5] offset:64
+; GFX906-NEXT:    global_load_dwordx4 v[21:24], v4, s[4:5] offset:80
+; GFX906-NEXT:    global_load_dwordx4 v[25:28], v4, s[4:5] offset:96
+; GFX906-NEXT:    global_load_dwordx4 v[29:32], v4, s[4:5] offset:112
+; GFX906-NEXT:    global_load_dwordx4 v[33:36], v4, s[4:5] offset:128
+; GFX906-NEXT:    global_load_dwordx4 v[37:40], v4, s[4:5] offset:144
+; GFX906-NEXT:    global_load_dwordx4 v[41:44], v4, s[4:5] offset:160
+; GFX906-NEXT:    global_load_dwordx4 v[45:48], v4, s[4:5] offset:176
+; GFX906-NEXT:    global_load_dwordx4 v[49:52], v4, s[4:5] offset:192
+; GFX906-NEXT:    global_load_dwordx4 v[53:56], v4, s[4:5] offset:208
+; GFX906-NEXT:    global_load_dwordx4 v[57:60], v4, s[4:5] offset:224
+; GFX906-NEXT:    global_load_dwordx4 v[0:3], v4, s[4:5] offset:240
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx4 v[5:8], v4, s[6:7]
-; GFX906-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7] offset:16
-; GFX906-NEXT:    global_load_dwordx4 v[29:32], v4, s[6:7] offset:32
-; GFX906-NEXT:    global_load_dwordx4 v[13:16], v4, s[6:7] offset:48
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v5
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:780 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v5
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v6
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v6
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v6
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v7
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v7
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:776 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v8
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(13)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v0
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v0
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7]
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    buffer_store_dword v1, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v2, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 16, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v3
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v29
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v29
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v29
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v30
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v30
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v30
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v31
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v31
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v31
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v32
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v32
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v32
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v13
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v13
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v13
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v14
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v14
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v14
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v15
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v15
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v15
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v16
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v16
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v16
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7] offset:64
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    global_load_dwordx4 v[45:48], v4, s[6:7] offset:80
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v0
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v0
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v45
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v45
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v45
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v46
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v46
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v46
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v47
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v47
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v47
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v48
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v48
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v48
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[57:60], v4, s[6:7] offset:96
-; GFX906-NEXT:    global_load_dwordx4 v[41:44], v4, s[6:7] offset:112
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v57
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v57
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v57
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v58
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v58
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v58
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v59
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v59
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v59
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v60
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v60
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v60
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v41
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v41
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v41
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v42
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v42
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v42
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v43
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v43
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v43
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v44
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v44
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v44
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[53:56], v4, s[6:7] offset:128
-; GFX906-NEXT:    global_load_dwordx4 v[25:28], v4, s[6:7] offset:144
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v53
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v53
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v53
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v54
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v54
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v54
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v55
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v55
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v55
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v56
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v56
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v56
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v25
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v25
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v25
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v26
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v26
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v26
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v27
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v27
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v27
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v28
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v28
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v28
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[49:52], v4, s[6:7] offset:160
-; GFX906-NEXT:    global_load_dwordx4 v[21:24], v4, s[6:7] offset:176
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v49
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v49
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v49
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v50
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v50
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v50
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v51
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v51
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v51
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v52
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v52
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v52
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v21
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v21
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v21
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v22
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v22
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v22
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v23
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v23
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v23
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v24
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v24
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v24
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[33:36], v4, s[6:7] offset:192
-; GFX906-NEXT:    global_load_dwordx4 v[17:20], v4, s[6:7] offset:208
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v33
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v33
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v33
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v34
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v34
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v34
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v35
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v35
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v35
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v36
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v36
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v36
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v17
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v17
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v17
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v18
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v18
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v18
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v19
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v19
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v19
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v20
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v20
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v20
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[37:40], v4, s[6:7] offset:224
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    global_load_dwordx4 v[9:12], v4, s[6:7] offset:240
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 8, v37
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v37
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v37
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 8, v38
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v38
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v38
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 8, v39
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v39
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v39
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 8, v40
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v40
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v40
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 8, v9
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v9
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v9
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v10
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 8, v11
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v11
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v11
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 8, v12
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v12
-; GFX906-NEXT:    v_lshrrev_b32_e32 v63, 8, v10
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v10
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 ; 4-byte Folded Spill
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    buffer_store_dword v10, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v11, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v12, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v12
+; GFX906-NEXT:    buffer_store_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[5:8], v4, s[6:7] offset:16
+; GFX906-NEXT:    global_load_dwordx4 v[9:12], v4, s[6:7] offset:32
+; GFX906-NEXT:    global_load_dwordx4 v[13:16], v4, s[6:7] offset:48
+; GFX906-NEXT:    global_load_dwordx4 v[17:20], v4, s[6:7] offset:64
+; GFX906-NEXT:    global_load_dwordx4 v[21:24], v4, s[6:7] offset:80
+; GFX906-NEXT:    global_load_dwordx4 v[25:28], v4, s[6:7] offset:96
+; GFX906-NEXT:    global_load_dwordx4 v[29:32], v4, s[6:7] offset:112
+; GFX906-NEXT:    global_load_dwordx4 v[33:36], v4, s[6:7] offset:128
+; GFX906-NEXT:    global_load_dwordx4 v[37:40], v4, s[6:7] offset:144
+; GFX906-NEXT:    global_load_dwordx4 v[41:44], v4, s[6:7] offset:160
+; GFX906-NEXT:    global_load_dwordx4 v[45:48], v4, s[6:7] offset:176
+; GFX906-NEXT:    global_load_dwordx4 v[49:52], v4, s[6:7] offset:192
+; GFX906-NEXT:    global_load_dwordx4 v[53:56], v4, s[6:7] offset:208
+; GFX906-NEXT:    global_load_dwordx4 v[57:60], v4, s[6:7] offset:224
+; GFX906-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7] offset:240
 ; GFX906-NEXT:  .LBB6_2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:784 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v63, off, s[8:11], 0 offset:788 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:792 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    buffer_store_dword v18, off, s[8:11], 0 offset:796 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v19, off, s[8:11], 0 offset:800 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v20, off, s[8:11], 0 offset:804 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:780 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_mov_b32_e32 v4, 8
-; GFX906-NEXT:    v_mov_b32_e32 v63, 0xff
-; GFX906-NEXT:    v_mov_b32_e32 v18, v16
-; GFX906-NEXT:    v_mov_b32_e32 v17, v15
-; GFX906-NEXT:    v_mov_b32_e32 v16, v14
-; GFX906-NEXT:    v_mov_b32_e32 v15, v13
-; GFX906-NEXT:    v_mov_b32_e32 v19, v9
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v61, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:768 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_or_b32 v5, v5, v63, v61
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v61, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:772 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_or_b32 v6, v6, v63, v61
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v61, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:776 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_or_b32 v7, v7, v63, v61
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v61, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:692 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v11, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v12, off, s[8:11], 0 offset:20 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v13, off, s[8:11], 0 offset:24 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v14, off, s[8:11], 0 offset:28 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:712 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_or_b32 v8, v8, v63, v61
-; GFX906-NEXT:    v_and_b32_e32 v61, 0xff, v62
-; GFX906-NEXT:    v_lshlrev_b32_e32 v61, 16, v61
-; GFX906-NEXT:    s_waitcnt vmcnt(5)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v62, 24, v10
-; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:716 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX906-NEXT:    v_or3_b32 v5, v5, v61, v62
-; GFX906-NEXT:    v_mov_b32_e32 v61, 0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX906-NEXT:    v_or3_b32 v6, v6, v9, v10
-; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:736 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:740 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX906-NEXT:    v_or3_b32 v7, v7, v9, v10
-; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:744 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:748 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX906-NEXT:    v_or3_b32 v8, v8, v9, v10
-; GFX906-NEXT:    global_store_dwordx4 v61, v[5:8], s[2:3]
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:696 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:700 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:564 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:568 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:704 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:708 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(5)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v5, v11, v63, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v6, v12, v63, v6
-; GFX906-NEXT:    buffer_load_dword v11, off, s[8:11], 0 offset:572 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v12, off, s[8:11], 0 offset:576 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(5)
-; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX906-NEXT:    v_or3_b32 v5, v5, v9, v10
-; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:588 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:592 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(5)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v7, v13, v63, v7
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX906-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
-; GFX906-NEXT:    v_or3_b32 v6, v6, v11, v12
-; GFX906-NEXT:    buffer_load_dword v11, off, s[8:11], 0 offset:668 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v8, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX906-NEXT:    v_or3_b32 v7, v7, v9, v10
-; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:596 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:600 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v12, off, s[8:11], 0 offset:672 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_or_b32 v8, v14, v63, v8
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX906-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX906-NEXT:    v_or3_b32 v8, v8, v9, v10
-; GFX906-NEXT:    global_store_dwordx4 v61, v[5:8], s[2:3] offset:16
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:752 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
-; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:760 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:636 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:640 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:756 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:764 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(5)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v5, v29, v63, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX906-NEXT:    v_or3_b32 v5, v5, v9, v10
-; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:676 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:680 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_or_b32 v7, v31, v63, v7
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v6, v30, v63, v6
-; GFX906-NEXT:    v_or3_b32 v6, v6, v11, v12
-; GFX906-NEXT:    buffer_load_dword v11, off, s[8:11], 0 offset:524 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX906-NEXT:    v_or3_b32 v7, v7, v9, v10
-; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:684 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:688 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v8, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v8, v32, v63, v8
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX906-NEXT:    v_or3_b32 v8, v8, v9, v10
-; GFX906-NEXT:    global_store_dwordx4 v61, v[5:8], s[2:3] offset:32
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:652 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:660 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:500 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:504 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v12, off, s[8:11], 0 offset:528 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:656 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:664 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX906-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX906-NEXT:    s_waitcnt vmcnt(6)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v5, v15, v63, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(5)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX906-NEXT:    v_or3_b32 v5, v5, v9, v10
-; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:548 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:552 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_or_b32 v7, v17, v63, v7
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v6, v16, v63, v6
-; GFX906-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
-; GFX906-NEXT:    v_or3_b32 v6, v6, v11, v12
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX906-NEXT:    v_or3_b32 v7, v7, v9, v10
-; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:556 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:560 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v8, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v8, v18, v63, v8
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX906-NEXT:    v_or3_b32 v8, v8, v9, v10
-; GFX906-NEXT:    global_store_dwordx4 v61, v[5:8], s[2:3] offset:48
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:720 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v0, v0, v63, v5
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:724 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:584 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:604 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:608 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v1, v1, v63, v5
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:728 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX906-NEXT:    v_or3_b32 v1, v1, v7, v8
-; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:476 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v2, v2, v63, v5
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:732 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v3, v3, v63, v5
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:580 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    v_or3_b32 v0, v0, v5, v6
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:628 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:632 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v2, v2, v5, v6
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:644 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:648 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
-; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:64
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:612 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:620 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:468 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:472 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:480 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:616 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:624 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX906-NEXT:    s_waitcnt vmcnt(6)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v0, v45, v63, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(5)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v0, v0, v5, v6
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:508 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:512 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_or_b32 v2, v47, v63, v2
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v1, v46, v63, v1
-; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX906-NEXT:    v_or3_b32 v1, v1, v7, v8
-; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:428 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v2, v2, v5, v6
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:516 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:520 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v3, v48, v63, v3
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
-; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:80
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:532 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:540 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:404 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:408 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:432 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:536 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:544 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX906-NEXT:    s_waitcnt vmcnt(6)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v0, v57, v63, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(5)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v0, v0, v5, v6
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:452 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:456 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_or_b32 v2, v59, v63, v2
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v1, v58, v63, v1
-; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX906-NEXT:    v_or3_b32 v1, v1, v7, v8
-; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:332 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v2, v2, v5, v6
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:460 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:464 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v3, v60, v63, v3
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
-; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:96
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:436 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:440 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:308 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:312 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:336 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:444 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:448 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX906-NEXT:    s_waitcnt vmcnt(6)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v0, v41, v63, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(5)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v0, v0, v5, v6
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:356 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:360 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_or_b32 v1, v42, v63, v1
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX906-NEXT:    v_or3_b32 v1, v1, v7, v8
-; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:380 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v2, v43, v63, v2
-; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:384 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v3, v44, v63, v3
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v2, v2, v5, v6
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:364 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:368 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
-; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:112
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:484 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:492 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:372 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:376 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:488 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:496 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(5)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v0, v53, v63, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v0, v0, v5, v6
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:412 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:416 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_or_b32 v2, v55, v63, v2
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v1, v54, v63, v1
-; GFX906-NEXT:    v_or3_b32 v1, v1, v7, v8
-; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:284 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v2, v2, v5, v6
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:420 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:424 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v3, v56, v63, v3
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
-; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:128
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:396 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:276 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:280 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:288 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:392 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:400 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX906-NEXT:    s_waitcnt vmcnt(6)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v0, v25, v63, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(5)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v0, v0, v5, v6
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:316 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:320 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_or_b32 v2, v27, v63, v2
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v1, v26, v63, v1
-; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX906-NEXT:    v_or3_b32 v1, v1, v7, v8
-; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:236 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v2, v2, v5, v6
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:324 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:328 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v3, v28, v63, v3
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
-; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:144
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:340 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:348 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:164 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:168 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:240 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:344 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:352 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(6)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v0, v49, v63, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(5)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v0, v0, v5, v6
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:260 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:264 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_or_b32 v2, v51, v63, v2
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v1, v50, v63, v1
-; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v2, v2, v5, v6
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:268 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:272 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v3, v52, v63, v3
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:80 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX906-NEXT:    v_or3_b32 v1, v1, v7, v8
-; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:40 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:32 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:36 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:160
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:244 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:248 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:252 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:256 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v0, v21, v63, v0
-; GFX906-NEXT:    v_or3_b32 v0, v0, v5, v6
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:44 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:48 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v1, v22, v63, v1
-; GFX906-NEXT:    v_or3_b32 v1, v1, v7, v8
-; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:108 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v2, v23, v63, v2
-; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:112 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v3, v24, v63, v3
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v2, v2, v5, v6
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:52 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:56 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
-; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:176
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:292 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:300 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:60 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:64 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:296 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:304 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(5)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v0, v33, v63, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v0, v0, v5, v6
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:172 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:176 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_or_b32 v2, v35, v63, v2
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v1, v34, v63, v1
-; GFX906-NEXT:    v_or3_b32 v1, v1, v7, v8
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v2, v2, v5, v6
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:228 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:232 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v3, v36, v63, v3
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
-; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:192
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:116 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:792 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:796 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:800 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:804 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:120 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:124 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:136 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(7)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_and_or_b32 v0, v5, v63, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v2
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:128 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:132 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:148 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX906-NEXT:    v_and_or_b32 v1, v6, v63, v1
-; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX906-NEXT:    v_or3_b32 v1, v1, v2, v3
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:140 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:144 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:160 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX906-NEXT:    v_and_or_b32 v2, v7, v63, v2
-; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX906-NEXT:    v_or3_b32 v2, v2, v3, v5
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:152 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:156 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_and_or_b32 v3, v8, v63, v3
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
-; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:208
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:180 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:184 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:188 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:200 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:224 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:212 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(5)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v0, v37, v63, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v2
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:192 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:196 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX906-NEXT:    v_and_or_b32 v1, v38, v63, v1
-; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX906-NEXT:    v_or3_b32 v1, v1, v2, v3
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:204 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:208 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX906-NEXT:    v_and_or_b32 v2, v39, v63, v2
-; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX906-NEXT:    v_or3_b32 v2, v2, v3, v5
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:216 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:220 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_and_or_b32 v3, v40, v63, v3
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
-; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:224
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:68 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_store_dword v1, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v2, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_mov_b32_e32 v0, v57
+; GFX906-NEXT:    v_mov_b32_e32 v1, v58
+; GFX906-NEXT:    v_mov_b32_e32 v2, v59
+; GFX906-NEXT:    v_mov_b32_e32 v3, v60
+; GFX906-NEXT:    v_mov_b32_e32 v60, v56
+; GFX906-NEXT:    v_mov_b32_e32 v59, v55
+; GFX906-NEXT:    v_mov_b32_e32 v58, v54
+; GFX906-NEXT:    v_mov_b32_e32 v57, v53
+; GFX906-NEXT:    v_mov_b32_e32 v56, v52
+; GFX906-NEXT:    v_mov_b32_e32 v55, v51
+; GFX906-NEXT:    v_mov_b32_e32 v54, v50
+; GFX906-NEXT:    v_mov_b32_e32 v53, v49
+; GFX906-NEXT:    v_mov_b32_e32 v52, v48
+; GFX906-NEXT:    v_mov_b32_e32 v51, v47
+; GFX906-NEXT:    v_mov_b32_e32 v50, v46
+; GFX906-NEXT:    v_mov_b32_e32 v49, v45
+; GFX906-NEXT:    v_mov_b32_e32 v48, v44
+; GFX906-NEXT:    v_mov_b32_e32 v47, v43
+; GFX906-NEXT:    v_mov_b32_e32 v46, v42
+; GFX906-NEXT:    v_mov_b32_e32 v45, v41
+; GFX906-NEXT:    v_mov_b32_e32 v44, v40
+; GFX906-NEXT:    v_mov_b32_e32 v43, v39
+; GFX906-NEXT:    v_mov_b32_e32 v42, v38
+; GFX906-NEXT:    v_mov_b32_e32 v41, v37
+; GFX906-NEXT:    v_mov_b32_e32 v40, v36
+; GFX906-NEXT:    v_mov_b32_e32 v39, v35
+; GFX906-NEXT:    v_mov_b32_e32 v38, v34
+; GFX906-NEXT:    v_mov_b32_e32 v37, v33
+; GFX906-NEXT:    v_mov_b32_e32 v36, v32
+; GFX906-NEXT:    v_mov_b32_e32 v35, v31
+; GFX906-NEXT:    v_mov_b32_e32 v34, v30
+; GFX906-NEXT:    v_mov_b32_e32 v33, v29
+; GFX906-NEXT:    v_mov_b32_e32 v32, v28
+; GFX906-NEXT:    v_mov_b32_e32 v31, v27
+; GFX906-NEXT:    v_mov_b32_e32 v30, v26
+; GFX906-NEXT:    v_mov_b32_e32 v29, v25
+; GFX906-NEXT:    v_mov_b32_e32 v28, v24
+; GFX906-NEXT:    v_mov_b32_e32 v27, v23
+; GFX906-NEXT:    v_mov_b32_e32 v26, v22
+; GFX906-NEXT:    v_mov_b32_e32 v25, v21
+; GFX906-NEXT:    v_mov_b32_e32 v24, v20
+; GFX906-NEXT:    v_mov_b32_e32 v23, v19
+; GFX906-NEXT:    v_mov_b32_e32 v22, v18
+; GFX906-NEXT:    v_mov_b32_e32 v21, v17
+; GFX906-NEXT:    v_mov_b32_e32 v20, v16
+; GFX906-NEXT:    v_mov_b32_e32 v19, v15
+; GFX906-NEXT:    v_mov_b32_e32 v18, v14
+; GFX906-NEXT:    v_mov_b32_e32 v17, v13
+; GFX906-NEXT:    v_mov_b32_e32 v16, v12
+; GFX906-NEXT:    v_mov_b32_e32 v15, v11
+; GFX906-NEXT:    v_mov_b32_e32 v14, v10
+; GFX906-NEXT:    v_mov_b32_e32 v13, v9
+; GFX906-NEXT:    v_mov_b32_e32 v12, v8
+; GFX906-NEXT:    v_mov_b32_e32 v11, v7
+; GFX906-NEXT:    v_mov_b32_e32 v10, v6
+; GFX906-NEXT:    v_mov_b32_e32 v9, v5
 ; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 ; 4-byte Folded Reload
 ; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
 ; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
 ; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:88 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:784 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:72 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(7)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_and_or_b32 v0, v5, v63, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v2
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:788 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:92 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:76 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v1, v6, v63, v1
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX906-NEXT:    v_or3_b32 v1, v1, v2, v3
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:84 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:96 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX906-NEXT:    v_and_or_b32 v2, v7, v63, v2
-; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX906-NEXT:    v_or3_b32 v2, v2, v3, v5
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:104 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 24, v19
+; GFX906-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    buffer_load_dword v4, off, s[8:11], 0 offset:100 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_or_b32 v3, v8, v63, v3
+; GFX906-NEXT:    global_store_dwordx4 v4, v[5:8], s[2:3]
+; GFX906-NEXT:    global_store_dwordx4 v4, v[9:12], s[2:3] offset:16
+; GFX906-NEXT:    global_store_dwordx4 v4, v[13:16], s[2:3] offset:32
+; GFX906-NEXT:    global_store_dwordx4 v4, v[17:20], s[2:3] offset:48
+; GFX906-NEXT:    global_store_dwordx4 v4, v[21:24], s[2:3] offset:64
+; GFX906-NEXT:    global_store_dwordx4 v4, v[25:28], s[2:3] offset:80
+; GFX906-NEXT:    global_store_dwordx4 v4, v[29:32], s[2:3] offset:96
+; GFX906-NEXT:    global_store_dwordx4 v4, v[33:36], s[2:3] offset:112
+; GFX906-NEXT:    global_store_dwordx4 v4, v[37:40], s[2:3] offset:128
+; GFX906-NEXT:    global_store_dwordx4 v4, v[41:44], s[2:3] offset:144
+; GFX906-NEXT:    global_store_dwordx4 v4, v[45:48], s[2:3] offset:160
+; GFX906-NEXT:    global_store_dwordx4 v4, v[49:52], s[2:3] offset:176
+; GFX906-NEXT:    global_store_dwordx4 v4, v[53:56], s[2:3] offset:192
+; GFX906-NEXT:    global_store_dwordx4 v4, v[57:60], s[2:3] offset:208
+; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:224
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:20 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:24 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX906-NEXT:    v_or3_b32 v3, v3, v4, v5
-; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:240
+; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:240
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-loop-carried.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-loop-carried.ll
index 95c541e2e60b7f..ffc91815821a17 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-loop-carried.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-loop-carried.ll
@@ -6,47 +6,32 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 5, v0
+; GFX906-NEXT:    v_mov_b32_e32 v3, 8
+; GFX906-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GFX906-NEXT:    v_cmp_le_u32_e32 vcc, 15, v0
-; GFX906-NEXT:    v_mov_b32_e32 v3, 0xff
-; GFX906-NEXT:    v_mov_b32_e32 v4, 8
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    global_load_dword v1, v1, s[2:3]
 ; GFX906-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v2, 24, v1
-; GFX906-NEXT:    v_mov_b32_e32 v5, v0
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX906-NEXT:    v_and_or_b32 v0, v1, v2, v0
+; GFX906-NEXT:    v_mov_b32_e32 v2, 24
 ; GFX906-NEXT:  .LBB0_1: ; %bb.1
 ; GFX906-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX906-NEXT:    v_and_or_b32 v6, v1, v3, v6
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX906-NEXT:    v_and_b32_e32 v3, 0xff, v1
 ; GFX906-NEXT:    s_and_b64 s[4:5], exec, vcc
-; GFX906-NEXT:    v_or3_b32 v5, v6, v5, v2
+; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX906-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
-; GFX906-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
-; GFX906-NEXT:    v_mov_b32_e32 v6, v0
+; GFX906-NEXT:    v_or3_b32 v1, v0, v3, v1
 ; GFX906-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX906-NEXT:    s_cbranch_execnz .LBB0_1
 ; GFX906-NEXT:  ; %bb.2: ; %bb.2.loopexit
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX906-NEXT:    v_mov_b32_e32 v4, 8
-; GFX906-NEXT:    v_mov_b32_e32 v3, 0xff
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v0, v1, v3, v0
-; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v5
-; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v2
-; GFX906-NEXT:    v_mov_b32_e32 v1, 0
+; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX906-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()

>From 35c5eb531c2b6d2c732ecc61e09c4713b6381fd2 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 19 Apr 2024 11:09:29 -0700
Subject: [PATCH 10/11] remove unintentional changes

Change-Id: Idbfbbadfc1c3cee6cbd1a814b3446628dcce4394
---
 llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index a37302d3c41267..6e7d34f5adaa3f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -107,7 +107,6 @@ class AMDGPUCodeGenPrepareImpl
   Module *Mod = nullptr;
   const DataLayout *DL = nullptr;
   bool HasUnsafeFPMath = false;
-  bool UsesGlobalISel = false;
   bool HasFP32DenormalFlush = false;
   bool FlowChanged = false;
   mutable Function *SqrtF32 = nullptr;
@@ -361,7 +360,6 @@ bool AMDGPUCodeGenPrepareImpl::run(Function &F) {
       Next = std::next(I);
 
       MadeChange |= visit(*I);
-      I->getType();
 
       if (Next != E) { // Control flow changed
         BasicBlock *NextInstBB = Next->getParent();
@@ -373,7 +371,6 @@ bool AMDGPUCodeGenPrepareImpl::run(Function &F) {
       }
     }
   }
-
   return MadeChange;
 }
 
@@ -2278,7 +2275,6 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
   Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
   Impl.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
   Impl.UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
-  Impl.UsesGlobalISel = TM.Options.EnableGlobalISel;
   auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
   Impl.DT = DTWP ? &DTWP->getDomTree() : nullptr;
   Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
@@ -2301,7 +2297,6 @@ PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
   Impl.DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
   Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
   SIModeRegisterDefaults Mode(F, *Impl.ST);
-  Impl.UsesGlobalISel = TM.Options.EnableGlobalISel;
   Impl.HasFP32DenormalFlush =
       Mode.FP32Denormals == DenormalMode::getPreserveSign();
   PreservedAnalyses PA = PreservedAnalyses::none();

>From efe24b60b11f7e1acb92689a7d5445546b40110d Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 30 Apr 2024 13:14:12 -0700
Subject: [PATCH 11/11] Review comments

Change-Id: I244784728ff1b4363ff066f8c5a6fa6d03c2a4d5
---
 .../AMDGPU/AMDGPULateCodeGenPrepare.cpp       | 203 +++++++++---------
 .../test/CodeGen/AMDGPU/vni8-across-blocks.ll |  53 ++++-
 2 files changed, 152 insertions(+), 104 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 822b85fac5188f..d7d2ebff03b6b7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -126,16 +126,17 @@ class ConversionCandidateInfo {
         NewType(NewType), ConvertBuilder(InsertBlock, InsertPt) {}
 };
 
+typedef std::pair<Instruction *, BasicBlock *> IncomingPair;
+typedef std::pair<Instruction *, SmallVector<IncomingPair, 4>> PHIUpdateInfo;
+
 class LiveRegOptimizer {
 private:
   Module *Mod = nullptr;
+  const DataLayout *DL = nullptr;
   // The scalar type to convert to
   Type *ConvertToScalar;
   // Holds the collection of PHIs with their pending new operands
-  SmallVector<std::pair<Instruction *,
-                        SmallVector<std::pair<Instruction *, BasicBlock *>, 4>>,
-              4>
-      PHIUpdater;
+  SmallVector<PHIUpdateInfo, 4> PHIUpdater;
 
 public:
   // Should the def of the instruction be converted if it is live across blocks
@@ -157,6 +158,7 @@ class LiveRegOptimizer {
   bool replacePHIs();
 
   LiveRegOptimizer(Module *Mod) : Mod(Mod) {
+    DL = &Mod->getDataLayout();
     ConvertToScalar = Type::getInt32Ty(Mod->getContext());
   }
 };
@@ -182,17 +184,18 @@ bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
   AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
   UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
 
-  // "Optimize" the virtual regs that cross basic block boundaries. In such
-  // cases, vectors of illegal types will be scalarized and widened, with each
-  // scalar living in its own physical register. The optimization converts the
-  // vectors to equivalent vectors of legal type (which are convereted back
+  // "Optimize" the virtual regs that cross basic block boundaries. When
+  // building the SelectionDAG, vectors of illegal types that cross basic blocks
+  // will be scalarized and widened, with each scalar living in its
+  // own physical register. To work around this, this optimization converts the
+  // vectors to equivalent vectors of legal type (which are converted back
   // before uses in subsequent blocks), to pack the bits into fewer physical
   // registers (used in CopyToReg/CopyFromReg pairs).
   LiveRegOptimizer LRO(Mod);
 
   bool Changed = false;
   for (auto &BB : F)
-    for (Instruction &I : llvm::make_early_inc_range(BB)) {
+    for (Instruction &I : make_early_inc_range(BB)) {
       Changed |= visit(I);
       if (!LRO.shouldReplaceUses(I))
         continue;
@@ -212,65 +215,59 @@ bool LiveRegOptimizer::replaceUses(Instruction &I) {
   };
   DenseMap<BasicBlock *, ConvertUseInfo> InsertedConversionMap;
 
-  ConversionCandidateInfo FromCCI(
-      &I, I.getParent(),
-      static_cast<BasicBlock::iterator>(std::next(I.getIterator())));
+  ConversionCandidateInfo FromCCI(&I, I.getParent(),
+                                  std::next(I.getIterator()));
   FromCCI.setNewType(getCompatibleType(FromCCI.getLiveRegDef()));
   for (auto IUser = I.user_begin(); IUser != I.user_end(); IUser++) {
 
-    if (Instruction *UserInst = dyn_cast<Instruction>(*IUser)) {
-      if (UserInst->getParent() != I.getParent() || isa<PHINode>(UserInst)) {
-        LLVM_DEBUG(dbgs() << *UserInst << "\n\tUses "
-                          << *FromCCI.getOriginalType()
-                          << " from previous block. Needs conversion\n");
-        convertToOptType(FromCCI);
-        if (!FromCCI.hasConverted())
-          continue;
-        // If it is a PHI node, just create and collect the new operand. We can
-        // only replace the PHI node once we have converted all the operands
-        if (auto PhiInst = dyn_cast<PHINode>(UserInst)) {
-          for (unsigned Idx = 0; Idx < PhiInst->getNumIncomingValues(); Idx++) {
-            Value *IncVal = PhiInst->getIncomingValue(Idx);
-            if (&I == dyn_cast<Instruction>(IncVal)) {
-              BasicBlock *IncBlock = PhiInst->getIncomingBlock(Idx);
-              auto PHIOps = find_if(
-                  PHIUpdater,
-                  [&UserInst](
-                      std::pair<Instruction *,
-                                SmallVector<
-                                    std::pair<Instruction *, BasicBlock *>, 4>>
-                          &Entry) { return Entry.first == UserInst; });
-
-              if (PHIOps == PHIUpdater.end())
-                PHIUpdater.push_back(
-                    {UserInst, {{FromCCI.getConverted(), IncBlock}}});
-              else
-                PHIOps->second.push_back({FromCCI.getConverted(), IncBlock});
-
-              break;
-            }
+    Instruction *UserInst = cast<Instruction>(*IUser);
+    if (UserInst->getParent() != I.getParent() || isa<PHINode>(UserInst)) {
+      LLVM_DEBUG(dbgs() << *UserInst << "\n\tUses "
+                        << *FromCCI.getOriginalType()
+                        << " from previous block. Needs conversion\n");
+      convertToOptType(FromCCI);
+      if (!FromCCI.hasConverted())
+        continue;
+      // If it is a PHI node, just create and collect the new operand. We can
+      // only replace the PHI node once we have converted all the operands
+      if (auto PHI = dyn_cast<PHINode>(UserInst)) {
+        for (unsigned Idx = 0; Idx < PHI->getNumIncomingValues(); Idx++) {
+          Value *IncVal = PHI->getIncomingValue(Idx);
+          if (&I == dyn_cast<Instruction>(IncVal)) {
+            BasicBlock *IncBlock = PHI->getIncomingBlock(Idx);
+            auto PHIOps =
+                find_if(PHIUpdater, [&UserInst](PHIUpdateInfo &Entry) {
+                  return Entry.first == UserInst;
+                });
+
+            if (PHIOps == PHIUpdater.end())
+              PHIUpdater.push_back(
+                  {UserInst, {{FromCCI.getConverted(), IncBlock}}});
+            else
+              PHIOps->second.push_back({FromCCI.getConverted(), IncBlock});
+
+            break;
           }
-          continue;
-        }
-
-        // Do not create multiple conversion sequences if there are multiple
-        // uses in the same block
-        if (InsertedConversionMap.contains(UserInst->getParent())) {
-          InsertedConversionMap[UserInst->getParent()].Users.push_back(
-              UserInst);
-          LLVM_DEBUG(dbgs() << "\tUser already has access to converted def\n");
-          continue;
         }
+        continue;
+      }
 
-        ConversionCandidateInfo ToCCI(
-            FromCCI.getConverted(), I.getType(), UserInst->getParent(),
-            static_cast<BasicBlock::iterator>(
-                UserInst->getParent()->getFirstNonPHIIt()));
-        convertFromOptType(ToCCI);
-        assert(ToCCI.hasConverted());
-        InsertedConversionMap[UserInst->getParent()] = {ToCCI.getConverted(),
-                                                        {UserInst}};
+      // Do not create multiple conversion sequences if there are multiple
+      // uses in the same block
+      if (InsertedConversionMap.contains(UserInst->getParent())) {
+        InsertedConversionMap[UserInst->getParent()].Users.push_back(UserInst);
+        LLVM_DEBUG(dbgs() << "\tUser already has access to converted def\n");
+        continue;
       }
+
+      ConversionCandidateInfo ToCCI(FromCCI.getConverted(), I.getType(),
+                                    UserInst->getParent(),
+
+                                    UserInst->getParent()->getFirstNonPHIIt());
+      convertFromOptType(ToCCI);
+      assert(ToCCI.hasConverted());
+      InsertedConversionMap[UserInst->getParent()] = {ToCCI.getConverted(),
+                                                      {UserInst}};
     }
   }
 
@@ -279,7 +276,7 @@ bool LiveRegOptimizer::replaceUses(Instruction &I) {
   for (auto &Entry : InsertedConversionMap) {
     for (auto &UserInst : Entry.second.Users) {
       LLVM_DEBUG(dbgs() << *UserInst
-                        << "\n\tNow uses: " << *Entry.second.Converted << "\n");
+                        << "\n\tNow uses: " << *Entry.second.Converted << '\n');
       UserInst->replaceUsesOfWith(&I, Entry.second.Converted);
       MadeChange = true;
     }
@@ -290,29 +287,29 @@ bool LiveRegOptimizer::replaceUses(Instruction &I) {
 bool LiveRegOptimizer::replacePHIs() {
   bool MadeChange = false;
   for (auto Ele : PHIUpdater) {
-    auto ThePHINode = cast<PHINode>(Ele.first);
-    auto NewPHINodeOps = Ele.second;
-    LLVM_DEBUG(dbgs() << "Attempting to replace: " << *ThePHINode << "\n");
+    auto [ThePHIInst, NewPHINodeOps] = Ele;
+    LLVM_DEBUG(dbgs() << "Attempting to replace: " << *ThePHIInst << '\n');
     // If we have conveted all the required operands, then do the replacement
-    if (ThePHINode->getNumIncomingValues() == NewPHINodeOps.size()) {
+    if (cast<PHINode>(ThePHIInst)->getNumIncomingValues() ==
+        NewPHINodeOps.size()) {
       IRBuilder<> Builder(Ele.first);
       auto NPHI = Builder.CreatePHI(NewPHINodeOps[0].first->getType(),
                                     NewPHINodeOps.size());
       for (auto IncVals : NewPHINodeOps) {
         NPHI->addIncoming(IncVals.first, IncVals.second);
         LLVM_DEBUG(dbgs() << "  Using: " << *IncVals.first
-                          << "  For: " << IncVals.second->getName() << "\n");
+                          << "  For: " << IncVals.second->getName() << '\n');
       }
-      LLVM_DEBUG(dbgs() << "Sucessfully replaced with " << *NPHI << "\n");
+      LLVM_DEBUG(dbgs() << "Sucessfully replaced with " << *NPHI << '\n');
       ConversionCandidateInfo ToCCI(
-          NPHI, ThePHINode->getType(), ThePHINode->getParent(),
-          static_cast<BasicBlock::iterator>(
-              ThePHINode->getParent()->getFirstNonPHIIt()));
+          NPHI, ThePHIInst->getType(), ThePHIInst->getParent(),
+
+          ThePHIInst->getParent()->getFirstNonPHIIt());
       convertFromOptType(ToCCI);
       assert(ToCCI.hasConverted());
       Ele.first->replaceAllUsesWith(ToCCI.getConverted());
       // The old PHI is no longer used
-      ThePHINode->eraseFromParent();
+      ThePHIInst->eraseFromParent();
       MadeChange = true;
     }
   }
@@ -327,8 +324,8 @@ Type *LiveRegOptimizer::getCompatibleType(Instruction *InstToConvert) {
   if (!VTy)
     return ConvertToScalar;
 
-  unsigned OriginalSize = VTy->getPrimitiveSizeInBits();
-  unsigned ConvertScalarSize = ConvertToScalar->getScalarSizeInBits();
+  TypeSize OriginalSize = DL->getTypeSizeInBits(VTy);
+  TypeSize ConvertScalarSize = DL->getTypeSizeInBits(ConvertToScalar);
   unsigned ConvertEltCount =
       (OriginalSize + ConvertScalarSize - 1) / ConvertScalarSize;
 
@@ -336,7 +333,7 @@ Type *LiveRegOptimizer::getCompatibleType(Instruction *InstToConvert) {
     return IntegerType::get(Mod->getContext(), ConvertScalarSize);
 
   return VectorType::get(Type::getIntNTy(Mod->getContext(), ConvertScalarSize),
-                         llvm::ElementCount::getFixed(ConvertEltCount));
+                         ElementCount::getFixed(ConvertEltCount));
 }
 
 void LiveRegOptimizer::convertToOptType(ConversionCandidateInfo &LR) {
@@ -348,24 +345,24 @@ void LiveRegOptimizer::convertToOptType(ConversionCandidateInfo &LR) {
   VectorType *VTy = cast<VectorType>(LR.getOriginalType());
   Type *NewTy = LR.getNewType();
 
-  unsigned OriginalSize = VTy->getPrimitiveSizeInBits();
-  unsigned NewSize = NewTy->getPrimitiveSizeInBits();
+  TypeSize OriginalSize = DL->getTypeSizeInBits(VTy);
+  TypeSize NewSize = DL->getTypeSizeInBits(NewTy);
 
   auto &Builder = LR.getConvertBuilder();
-  Value *V = static_cast<Value *>(LR.getLiveRegDef());
+  Value *V = cast<Value>(LR.getLiveRegDef());
   // If there is a bitsize match, we can fit the old vector into a new vector of
   // desired type
   if (OriginalSize == NewSize) {
-    LR.setConverted(dyn_cast<Instruction>(Builder.CreateBitCast(V, NewTy)));
+    LR.setConverted(cast<Instruction>(Builder.CreateBitCast(V, NewTy)));
     LLVM_DEBUG(dbgs() << "\tConverted def to " << *LR.getConverted()->getType()
-                      << "\n");
+                      << '\n');
     return;
   }
 
   // If there is a bitsize mismatch, we must use a wider vector
   assert(NewSize > OriginalSize);
   ElementCount ExpandedVecElementCount =
-      llvm::ElementCount::getFixed(NewSize / VTy->getScalarSizeInBits());
+      ElementCount::getFixed(NewSize / VTy->getScalarSizeInBits());
 
   SmallVector<int, 8> ShuffleMask;
   for (unsigned I = 0; I < VTy->getElementCount().getFixedValue(); I++)
@@ -375,12 +372,11 @@ void LiveRegOptimizer::convertToOptType(ConversionCandidateInfo &LR) {
        I < ExpandedVecElementCount.getFixedValue(); I++)
     ShuffleMask.push_back(VTy->getElementCount().getFixedValue());
 
-  auto ExpandedVec =
-      dyn_cast<Instruction>(Builder.CreateShuffleVector(V, ShuffleMask));
-  LR.setConverted(
-      dyn_cast<Instruction>(Builder.CreateBitCast(ExpandedVec, NewTy)));
+  Instruction *ExpandedVec =
+      cast<Instruction>(Builder.CreateShuffleVector(V, ShuffleMask));
+  LR.setConverted(cast<Instruction>(Builder.CreateBitCast(ExpandedVec, NewTy)));
   LLVM_DEBUG(dbgs() << "\tConverted def to " << *LR.getConverted()->getType()
-                    << "\n");
+                    << '\n');
   return;
 }
 
@@ -388,48 +384,49 @@ void LiveRegOptimizer::convertFromOptType(ConversionCandidateInfo &LRC) {
   Type *OTy = LRC.getOriginalType();
   VectorType *NewVTy = cast<VectorType>(LRC.getNewType());
 
-  unsigned OriginalSize = OTy->getPrimitiveSizeInBits();
-  unsigned NewSize = NewVTy->getPrimitiveSizeInBits();
+  TypeSize OriginalSize = DL->getTypeSizeInBits(OTy);
+  TypeSize NewSize = DL->getTypeSizeInBits(NewVTy);
 
   auto &Builder = LRC.getConvertBuilder();
-  Value *V = static_cast<Value *>(LRC.getLiveRegDef());
+  Value *V = cast<Value>(LRC.getLiveRegDef());
   // If there is a bitsize match, we simply convert back to the original type
   if (OriginalSize == NewSize) {
-    LRC.setConverted(dyn_cast<Instruction>(Builder.CreateBitCast(V, NewVTy)));
+    LRC.setConverted(cast<Instruction>(Builder.CreateBitCast(V, NewVTy)));
     LLVM_DEBUG(dbgs() << "\tProduced for user: " << *LRC.getConverted()
-                      << "\n");
+                      << '\n');
     return;
   }
 
   if (!OTy->isVectorTy()) {
-    auto Trunc = dyn_cast<Instruction>(Builder.CreateTrunc(
+    Instruction *Trunc = cast<Instruction>(Builder.CreateTrunc(
         LRC.getLiveRegDef(), IntegerType::get(Mod->getContext(), NewSize)));
-    auto Original = dyn_cast<Instruction>(Builder.CreateBitCast(Trunc, NewVTy));
-    LRC.setConverted(dyn_cast<Instruction>(Original));
+    Instruction *Original =
+        cast<Instruction>(Builder.CreateBitCast(Trunc, NewVTy));
+    LRC.setConverted(cast<Instruction>(Original));
     LLVM_DEBUG(dbgs() << "\tProduced for user: " << *LRC.getConverted()
-                      << "\n");
+                      << '\n');
     return;
   }
 
   // If there is a bitsize mismatch, we have used a wider vector and must strip
   // the MSBs to convert back to the original type
   assert(OriginalSize > NewSize);
-  ElementCount ExpandedVecElementCount = llvm::ElementCount::getFixed(
-      OriginalSize / NewVTy->getScalarSizeInBits());
+  ElementCount ExpandedVecElementCount =
+      ElementCount::getFixed(OriginalSize / NewVTy->getScalarSizeInBits());
   VectorType *ExpandedVT = VectorType::get(
       Type::getIntNTy(Mod->getContext(), NewVTy->getScalarSizeInBits()),
       ExpandedVecElementCount);
-  Instruction *Converted = dyn_cast<Instruction>(
-      Builder.CreateBitCast(LRC.getLiveRegDef(), ExpandedVT));
+  Instruction *Converted =
+      cast<Instruction>(Builder.CreateBitCast(LRC.getLiveRegDef(), ExpandedVT));
 
   unsigned NarrowElementCount = NewVTy->getElementCount().getFixedValue();
   SmallVector<int, 8> ShuffleMask(NarrowElementCount);
   std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);
 
-  Instruction *NarrowVec = dyn_cast<Instruction>(
-      Builder.CreateShuffleVector(Converted, ShuffleMask));
-  LRC.setConverted(dyn_cast<Instruction>(NarrowVec));
-  LLVM_DEBUG(dbgs() << "\tProduced for user: " << *LRC.getConverted() << "\n");
+  Instruction *NarrowVec =
+      cast<Instruction>(Builder.CreateShuffleVector(Converted, ShuffleMask));
+  LRC.setConverted(cast<Instruction>(NarrowVec));
+  LLVM_DEBUG(dbgs() << "\tProduced for user: " << *LRC.getConverted() << '\n');
   return;
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index 57179f8f26aec9..d08523f4d3cd00 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -350,5 +350,56 @@ bb.2:
   ret void
 }
 
-declare i32 @llvm.amdgcn.workitem.id.x()
+define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: repeat_successor:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_load_dword s8, s[0:1], 0x24
+; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    s_cmp_lt_i32 s8, 3
+; GFX906-NEXT:    s_cbranch_scc0 .LBB7_3
+; GFX906-NEXT:  ; %bb.1: ; %LeafBlock
+; GFX906-NEXT:    s_cmp_gt_i32 s8, 0
+; GFX906-NEXT:    s_cbranch_scc0 .LBB7_6
+; GFX906-NEXT:  ; %bb.2:
+; GFX906-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX906-NEXT:    global_load_dword v0, v0, s[4:5]
+; GFX906-NEXT:    s_branch .LBB7_5
+; GFX906-NEXT:  .LBB7_3: ; %LeafBlock5
+; GFX906-NEXT:    s_cmp_eq_u32 s8, 3
+; GFX906-NEXT:    s_cbranch_scc0 .LBB7_6
+; GFX906-NEXT:  ; %bb.4: ; %sw.bb5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX906-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX906-NEXT:  .LBB7_5: ; %return.sink.split
+; GFX906-NEXT:    v_mov_b32_e32 v1, 0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX906-NEXT:  .LBB7_6: ; %return
+; GFX906-NEXT:    s_endpgm
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+  switch i32 %in, label %return [
+    i32 1, label %return.sink.split
+    i32 2, label %return.sink.split
+    i32 3, label %sw.bb5
+  ]
+
+sw.bb5:
+  br label %return.sink.split
+
+return.sink.split:
+  %tmp5 = phi <4 x i8> [ %vec2, %sw.bb5 ], [ %vec1, %entry ], [ %vec1, %entry ]
+  store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4
+  ret void
 
+return:
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()



More information about the llvm-commits mailing list