[llvm] [AMDGPU] Add IR LiveReg type-based optimization (PR #66838)

Jeffrey Byrnes via llvm-commits llvm-commits at lists.llvm.org
Fri Jun 21 16:29:17 PDT 2024


https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/66838

>From 5efb9556bc49b97e72789065cfed9013388c6f50 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 14 Sep 2023 12:20:06 -0700
Subject: [PATCH 01/17] [AMDGPU] Add IR LiveReg type-based optimization

Change-Id: Ide8a46cdaf1d2d82cbd5296c998a5c8fd41fce80
---
 .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp    |  343 +++
 .../amdgpu-codegenprepare-break-large-phis.ll |  125 +-
 .../test/CodeGen/AMDGPU/vni8-across-blocks.ll | 1880 ++---------------
 3 files changed, 546 insertions(+), 1802 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 6e7d34f5adaa3..8f1dd1c522b04 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -107,6 +107,7 @@ class AMDGPUCodeGenPrepareImpl
   Module *Mod = nullptr;
   const DataLayout *DL = nullptr;
   bool HasUnsafeFPMath = false;
+  bool UsesGlobalISel = false;
   bool HasFP32DenormalFlush = false;
   bool FlowChanged = false;
   mutable Function *SqrtF32 = nullptr;
@@ -343,6 +344,85 @@ class AMDGPUCodeGenPrepare : public FunctionPass {
   StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
 };
 
+class LiveRegConversion {
+private:
+  // The instruction which defined the original virtual register used across
+  // blocks
+  Instruction *LiveRegDef;
+  // The original type
+  Type *OriginalType;
+  // The desired type
+  Type *NewType;
+  // The instruction sequence that converts the virtual register, to be used
+  // instead of the original
+  std::optional<Instruction *> Converted;
+  // The builder used to build the conversion instruction
+  IRBuilder<> ConvertBuilder;
+
+public:
+  // The instruction which defined the original virtual register used across
+  // blocks
+  Instruction *getLiveRegDef() { return LiveRegDef; }
+  // The original type
+  Type *getOriginalType() { return OriginalType; }
+  // The desired type
+  Type *getNewType() { return NewType; }
+  void setNewType(Type *NewType) { this->NewType = NewType; }
+  // The instruction that converts the virtual register, to be used instead of
+  // the original
+  std::optional<Instruction *> &getConverted() { return Converted; }
+  void setConverted(Instruction *Converted) { this->Converted = Converted; }
+  // The builder used to build the conversion instruction
+  IRBuilder<> &getConverBuilder() { return ConvertBuilder; }
+  // Do we have an instruction sequence which converts the original virtual
+  // register
+  bool hasConverted() { return Converted.has_value(); }
+
+  LiveRegConversion(Instruction *LiveRegDef, BasicBlock *InsertBlock,
+                    BasicBlock::iterator InsertPt)
+      : LiveRegDef(LiveRegDef), OriginalType(LiveRegDef->getType()),
+        ConvertBuilder(InsertBlock, InsertPt) {}
+  LiveRegConversion(Instruction *LiveRegDef, Type *NewType,
+                    BasicBlock *InsertBlock, BasicBlock::iterator InsertPt)
+      : LiveRegDef(LiveRegDef), OriginalType(LiveRegDef->getType()),
+        NewType(NewType), ConvertBuilder(InsertBlock, InsertPt) {}
+};
+
+class LiveRegOptimizer {
+private:
+  Module *Mod = nullptr;
+  // The scalar type to convert to
+  Type *ConvertToScalar;
+  // Holds the collection of PHIs with their pending new operands
+  SmallVector<std::pair<Instruction *,
+                        SmallVector<std::pair<Instruction *, BasicBlock *>, 4>>,
+              4>
+      PHIUpdater;
+
+public:
+  // Should the def of the instruction be converted if it is live across blocks
+  bool shouldReplaceUses(const Instruction &I);
+  // Convert the virtual register to the compatible vector of legal type
+  void convertToOptType(LiveRegConversion &LR);
+  // Convert the virtual register back to the original type, stripping away
+  // the MSBs in cases where there was an imperfect fit (e.g. v2i32 -> v7i8)
+  void convertFromOptType(LiveRegConversion &LR);
+  // Get a vector of desired scalar type that is compatible with the original
+  // vector. In cases where there is no bitsize equivalent using a legal vector
+  // type, we pad the MSBs (e.g. v7i8 -> v2i32)
+  Type *getCompatibleType(Instruction *InstToConvert);
+  // Find and replace uses of the virtual register in different block with a
+  // newly produced virtual register of legal type
+  bool replaceUses(Instruction &I);
+  // Replace the collected PHIs with newly produced incoming values. Replacement
+  // is only done if we have a replacement for each original incoming value.
+  bool replacePHIs();
+
+  LiveRegOptimizer(Module *Mod) : Mod(Mod) {
+    ConvertToScalar = Type::getInt32Ty(Mod->getContext());
+  }
+};
+
 } // end anonymous namespace
 
 bool AMDGPUCodeGenPrepareImpl::run(Function &F) {
@@ -360,6 +440,7 @@ bool AMDGPUCodeGenPrepareImpl::run(Function &F) {
       Next = std::next(I);
 
       MadeChange |= visit(*I);
+      I->getType();
 
       if (Next != E) { // Control flow changed
         BasicBlock *NextInstBB = Next->getParent();
@@ -371,9 +452,269 @@ bool AMDGPUCodeGenPrepareImpl::run(Function &F) {
       }
     }
   }
+
+  // GlobalISel should directly use the values, and do not need to emit
+  // CopyTo/CopyFrom Regs across blocks
+  if (UsesGlobalISel)
+    return MadeChange;
+
+  // "Optimize" the virtual regs that cross basic block boundaries. In such
+  // cases, vectors of illegal types will be scalarized and widened, with each
+  // scalar living in its own physical register. The optimization converts the
+  // vectors to equivalent vectors of legal type (which are converted back
+  // before uses in subsequent blocks), to pack the bits into fewer physical
+  // registers (used in CopyToReg/CopyFromReg pairs).
+  LiveRegOptimizer LRO(Mod);
+  for (auto &BB : F) {
+    for (auto &I : BB) {
+      if (!LRO.shouldReplaceUses(I))
+        continue;
+      MadeChange |= LRO.replaceUses(I);
+    }
+  }
+
+  MadeChange |= LRO.replacePHIs();
+  return MadeChange;
+}
+
+bool LiveRegOptimizer::replaceUses(Instruction &I) {
+  bool MadeChange = false;
+
+  struct ConvertUseInfo {
+    Instruction *Converted;
+    SmallVector<Instruction *, 4> Users;
+  };
+  DenseMap<BasicBlock *, ConvertUseInfo> UseConvertTracker;
+
+  LiveRegConversion FromLRC(
+      &I, I.getParent(),
+      static_cast<BasicBlock::iterator>(std::next(I.getIterator())));
+  FromLRC.setNewType(getCompatibleType(FromLRC.getLiveRegDef()));
+  for (auto IUser = I.user_begin(); IUser != I.user_end(); IUser++) {
+
+    if (auto UserInst = dyn_cast<Instruction>(*IUser)) {
+      if (UserInst->getParent() != I.getParent()) {
+        LLVM_DEBUG(dbgs() << *UserInst << "\n\tUses "
+                          << *FromLRC.getOriginalType()
+                          << " from previous block. Needs conversion\n");
+        convertToOptType(FromLRC);
+        if (!FromLRC.hasConverted())
+          continue;
+        // If it is a PHI node, just create and collect the new operand. We can
+        // only replace the PHI node once we have converted all the operands
+        if (auto PhiInst = dyn_cast<PHINode>(UserInst)) {
+          for (unsigned Idx = 0; Idx < PhiInst->getNumIncomingValues(); Idx++) {
+            auto IncVal = PhiInst->getIncomingValue(Idx);
+            if (&I == dyn_cast<Instruction>(IncVal)) {
+              auto IncBlock = PhiInst->getIncomingBlock(Idx);
+              auto PHIOps = find_if(
+                  PHIUpdater,
+                  [&UserInst](
+                      std::pair<Instruction *,
+                                SmallVector<
+                                    std::pair<Instruction *, BasicBlock *>, 4>>
+                          &Entry) { return Entry.first == UserInst; });
+
+              if (PHIOps == PHIUpdater.end())
+                PHIUpdater.push_back(
+                    {UserInst, {{*FromLRC.getConverted(), IncBlock}}});
+              else
+                PHIOps->second.push_back({*FromLRC.getConverted(), IncBlock});
+
+              break;
+            }
+          }
+          continue;
+        }
+
+        // Do not create multiple conversion sequences if there are multiple
+        // uses in the same block
+        if (UseConvertTracker.contains(UserInst->getParent())) {
+          UseConvertTracker[UserInst->getParent()].Users.push_back(UserInst);
+          LLVM_DEBUG(dbgs() << "\tUser already has access to converted def\n");
+          continue;
+        }
+
+        LiveRegConversion ToLRC(*FromLRC.getConverted(), I.getType(),
+                                UserInst->getParent(),
+                                static_cast<BasicBlock::iterator>(
+                                    UserInst->getParent()->getFirstNonPHIIt()));
+        convertFromOptType(ToLRC);
+        assert(ToLRC.hasConverted());
+        UseConvertTracker[UserInst->getParent()] = {*ToLRC.getConverted(),
+                                                    {UserInst}};
+      }
+    }
+  }
+
+  // Replace the uses of the original def with the converted def in a separate
+  // loop that is not dependent upon the state of the uses
+  for (auto &Entry : UseConvertTracker) {
+    for (auto &UserInst : Entry.second.Users) {
+      LLVM_DEBUG(dbgs() << *UserInst
+                        << "\n\tNow uses: " << *Entry.second.Converted << "\n");
+      UserInst->replaceUsesOfWith(&I, Entry.second.Converted);
+      MadeChange = true;
+    }
+  }
+  return MadeChange;
+}
+
+bool LiveRegOptimizer::replacePHIs() {
+  bool MadeChange = false;
+  for (auto Ele : PHIUpdater) {
+    auto ThePHINode = dyn_cast<PHINode>(Ele.first);
+    assert(ThePHINode);
+    auto NewPHINodeOps = Ele.second;
+    LLVM_DEBUG(dbgs() << "Attempting to replace: " << *ThePHINode << "\n");
+    // If we have converted all the required operands, then do the replacement
+    if (ThePHINode->getNumIncomingValues() == NewPHINodeOps.size()) {
+      IRBuilder<> Builder(Ele.first);
+      auto NPHI = Builder.CreatePHI(NewPHINodeOps[0].first->getType(),
+                                    NewPHINodeOps.size());
+      for (auto IncVals : NewPHINodeOps) {
+        NPHI->addIncoming(IncVals.first, IncVals.second);
+        LLVM_DEBUG(dbgs() << "  Using: " << *IncVals.first
+                          << "  For: " << IncVals.second->getName() << "\n");
+      }
+      LLVM_DEBUG(dbgs() << "Successfully replaced with " << *NPHI << "\n");
+      LiveRegConversion ToLRC(NPHI, ThePHINode->getType(),
+                              ThePHINode->getParent(),
+                              static_cast<BasicBlock::iterator>(
+                                  ThePHINode->getParent()->getFirstNonPHIIt()));
+      convertFromOptType(ToLRC);
+      assert(ToLRC.hasConverted());
+      Ele.first->replaceAllUsesWith(*ToLRC.getConverted());
+      // The old PHI is no longer used
+      ThePHINode->eraseFromParent();
+      MadeChange = true;
+    }
+  }
   return MadeChange;
 }
 
+Type *LiveRegOptimizer::getCompatibleType(Instruction *InstToConvert) {
+  auto OriginalType = InstToConvert->getType();
+  assert(OriginalType->getScalarSizeInBits() <=
+         ConvertToScalar->getScalarSizeInBits());
+  auto VTy = dyn_cast<VectorType>(OriginalType);
+  if (!VTy)
+    return ConvertToScalar;
+
+  auto OriginalSize =
+      VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue();
+  auto ConvertScalarSize = ConvertToScalar->getScalarSizeInBits();
+  auto ConvertEltCount =
+      (OriginalSize + ConvertScalarSize - 1) / ConvertScalarSize;
+
+  return VectorType::get(Type::getIntNTy(Mod->getContext(), ConvertScalarSize),
+                         llvm::ElementCount::getFixed(ConvertEltCount));
+}
+
+void LiveRegOptimizer::convertToOptType(LiveRegConversion &LR) {
+  if (LR.hasConverted()) {
+    LLVM_DEBUG(dbgs() << "\tAlready has converted def\n");
+    return;
+  }
+
+  auto VTy = dyn_cast<VectorType>(LR.getOriginalType());
+  assert(VTy);
+  auto NewVTy = dyn_cast<VectorType>(LR.getNewType());
+  assert(NewVTy);
+
+  auto V = static_cast<Value *>(LR.getLiveRegDef());
+  auto OriginalSize =
+      VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue();
+  auto NewSize =
+      NewVTy->getScalarSizeInBits() * NewVTy->getElementCount().getFixedValue();
+
+  auto &Builder = LR.getConverBuilder();
+
+  // If there is a bitsize match, we can fit the old vector into a new vector of
+  // desired type
+  if (OriginalSize == NewSize) {
+    LR.setConverted(dyn_cast<Instruction>(Builder.CreateBitCast(V, NewVTy)));
+    LLVM_DEBUG(dbgs() << "\tConverted def to "
+                      << *(*LR.getConverted())->getType() << "\n");
+    return;
+  }
+
+  // If there is a bitsize mismatch, we must use a wider vector
+  assert(NewSize > OriginalSize);
+  auto ExpandedVecElementCount =
+      llvm::ElementCount::getFixed(NewSize / VTy->getScalarSizeInBits());
+
+  SmallVector<int, 8> ShuffleMask;
+  for (unsigned I = 0; I < VTy->getElementCount().getFixedValue(); I++)
+    ShuffleMask.push_back(I);
+
+  for (uint64_t I = VTy->getElementCount().getFixedValue();
+       I < ExpandedVecElementCount.getFixedValue(); I++)
+    ShuffleMask.push_back(VTy->getElementCount().getFixedValue());
+
+  auto ExpandedVec =
+      dyn_cast<Instruction>(Builder.CreateShuffleVector(V, ShuffleMask));
+  LR.setConverted(
+      dyn_cast<Instruction>(Builder.CreateBitCast(ExpandedVec, NewVTy)));
+  LLVM_DEBUG(dbgs() << "\tConverted def to " << *(*LR.getConverted())->getType()
+                    << "\n");
+  return;
+}
+
+void LiveRegOptimizer::convertFromOptType(LiveRegConversion &LRC) {
+  auto VTy = dyn_cast<VectorType>(LRC.getOriginalType());
+  assert(VTy);
+  auto NewVTy = dyn_cast<VectorType>(LRC.getNewType());
+  assert(NewVTy);
+
+  auto V = static_cast<Value *>(LRC.getLiveRegDef());
+  auto OriginalSize =
+      VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue();
+  auto NewSize =
+      NewVTy->getScalarSizeInBits() * NewVTy->getElementCount().getFixedValue();
+
+  auto &Builder = LRC.getConverBuilder();
+
+  // If there is a bitsize match, we simply convert back to the original type
+  if (OriginalSize == NewSize) {
+    LRC.setConverted(dyn_cast<Instruction>(Builder.CreateBitCast(V, NewVTy)));
+    LLVM_DEBUG(dbgs() << "\tProduced for user: " << **LRC.getConverted()
+                      << "\n");
+    return;
+  }
+
+  // If there is a bitsize mismatch, we have used a wider vector and must strip
+  // the MSBs to convert back to the original type
+  assert(OriginalSize > NewSize);
+  auto ExpandedVecElementCount = llvm::ElementCount::getFixed(
+      OriginalSize / NewVTy->getScalarSizeInBits());
+  auto ExpandedVT = VectorType::get(
+      Type::getIntNTy(Mod->getContext(), NewVTy->getScalarSizeInBits()),
+      ExpandedVecElementCount);
+  auto Converted = dyn_cast<Instruction>(
+      Builder.CreateBitCast(LRC.getLiveRegDef(), ExpandedVT));
+
+  auto NarrowElementCount = NewVTy->getElementCount().getFixedValue();
+  SmallVector<int, 8> ShuffleMask;
+  for (uint64_t I = 0; I < NarrowElementCount; I++)
+    ShuffleMask.push_back(I);
+
+  auto NarrowVec = dyn_cast<Instruction>(
+      Builder.CreateShuffleVector(Converted, ShuffleMask));
+  LRC.setConverted(dyn_cast<Instruction>(NarrowVec));
+  LLVM_DEBUG(dbgs() << "\tProduced for user: " << **LRC.getConverted() << "\n");
+  return;
+}
+
+bool LiveRegOptimizer::shouldReplaceUses(const Instruction &I) {
+  // Vectors of illegal types are copied across blocks in an efficient manner.
+  // They are scalarized and widened to legal scalars. In such cases, we can do
+  // better by using legal vector types
+  auto IType = I.getType();
+  return IType->isVectorTy() && IType->getScalarSizeInBits() < 16 &&
+         !I.getType()->getScalarType()->isPointerTy();
+}
+
 unsigned AMDGPUCodeGenPrepareImpl::getBaseElementBitWidth(const Type *T) const {
   assert(needsPromotionToI32(T) && "T does not need promotion to i32");
 
@@ -2275,6 +2616,7 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
   Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
   Impl.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
   Impl.UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
+  Impl.UsesGlobalISel = TM.Options.EnableGlobalISel;
   auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
   Impl.DT = DTWP ? &DTWP->getDomTree() : nullptr;
   Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
@@ -2297,6 +2639,7 @@ PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
   Impl.DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
   Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
   SIModeRegisterDefaults Mode(F, *Impl.ST);
+  Impl.UsesGlobalISel = TM.Options.EnableGlobalISel;
   Impl.HasFP32DenormalFlush =
       Mode.FP32Denormals == DenormalMode::getPreserveSign();
   PreservedAnalyses PA = PreservedAnalyses::none();
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll
index 93b9aeac3cd3f..f4871fa131442 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll
@@ -495,10 +495,15 @@ define amdgpu_kernel void @phi_v23i8(<23 x i8> %in, ptr %out, i1 %cond) {
 ; OPT:       then:
 ; OPT-NEXT:    [[X:%.*]] = insertelement <23 x i8> [[IN:%.*]], i8 42, i32 3
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE0:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; OPT-NEXT:    [[TMP0:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE0]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE2:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; OPT-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE2]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE4:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; OPT-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE4]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE6:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; OPT-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE6]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE8:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; OPT-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE8]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE10:%.*]] = extractelement <23 x i8> [[X]], i64 20
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE12:%.*]] = extractelement <23 x i8> [[X]], i64 21
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE14:%.*]] = extractelement <23 x i8> [[X]], i64 22
@@ -506,31 +511,41 @@ define amdgpu_kernel void @phi_v23i8(<23 x i8> %in, ptr %out, i1 %cond) {
 ; OPT:       else:
 ; OPT-NEXT:    [[Y:%.*]] = insertelement <23 x i8> [[IN]], i8 64, i32 6
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE1:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; OPT-NEXT:    [[TMP5:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE1]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE3:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; OPT-NEXT:    [[TMP6:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE3]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE5:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; OPT-NEXT:    [[TMP7:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE5]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE7:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; OPT-NEXT:    [[TMP8:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE7]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE9:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; OPT-NEXT:    [[TMP9:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE9]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE11:%.*]] = extractelement <23 x i8> [[Y]], i64 20
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE13:%.*]] = extractelement <23 x i8> [[Y]], i64 21
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE15:%.*]] = extractelement <23 x i8> [[Y]], i64 22
 ; OPT-NEXT:    br label [[FINALLY]]
 ; OPT:       finally:
-; OPT-NEXT:    [[TMP0:%.*]] = phi <4 x i8> [ [[LARGEPHI_EXTRACTSLICE0]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP1:%.*]] = phi <4 x i8> [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP2:%.*]] = phi <4 x i8> [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP3:%.*]] = phi <4 x i8> [ [[LARGEPHI_EXTRACTSLICE6]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP4:%.*]] = phi <4 x i8> [ [[LARGEPHI_EXTRACTSLICE8]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP5:%.*]] = phi i8 [ [[LARGEPHI_EXTRACTSLICE10]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP6:%.*]] = phi i8 [ [[LARGEPHI_EXTRACTSLICE12]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE13]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP7:%.*]] = phi i8 [ [[LARGEPHI_EXTRACTSLICE14]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE15]], [[ELSE]] ]
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE0:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> poison, <4 x i8> [[TMP0]], i64 0)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE1:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE0]], <4 x i8> [[TMP1]], i64 4)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE2:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE1]], <4 x i8> [[TMP2]], i64 8)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE3:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE2]], <4 x i8> [[TMP3]], i64 12)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE4:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE3]], <4 x i8> [[TMP4]], i64 16)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE5:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE4]], i8 [[TMP5]], i64 20
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE6:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE5]], i8 [[TMP6]], i64 21
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE7:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE6]], i8 [[TMP7]], i64 22
+; OPT-NEXT:    [[TMP10:%.*]] = phi <1 x i32> [ [[TMP0]], [[THEN]] ], [ [[TMP5]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP11:%.*]] = phi <1 x i32> [ [[TMP1]], [[THEN]] ], [ [[TMP6]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP12:%.*]] = phi <1 x i32> [ [[TMP2]], [[THEN]] ], [ [[TMP7]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP13:%.*]] = phi <1 x i32> [ [[TMP3]], [[THEN]] ], [ [[TMP8]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP14:%.*]] = phi <1 x i32> [ [[TMP4]], [[THEN]] ], [ [[TMP9]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP15:%.*]] = phi i8 [ [[LARGEPHI_EXTRACTSLICE10]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP16:%.*]] = phi i8 [ [[LARGEPHI_EXTRACTSLICE12]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE13]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP17:%.*]] = phi i8 [ [[LARGEPHI_EXTRACTSLICE14]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE15]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP18:%.*]] = bitcast <1 x i32> [[TMP14]] to <4 x i8>
+; OPT-NEXT:    [[TMP19:%.*]] = bitcast <1 x i32> [[TMP13]] to <4 x i8>
+; OPT-NEXT:    [[TMP20:%.*]] = bitcast <1 x i32> [[TMP12]] to <4 x i8>
+; OPT-NEXT:    [[TMP21:%.*]] = bitcast <1 x i32> [[TMP11]] to <4 x i8>
+; OPT-NEXT:    [[TMP22:%.*]] = bitcast <1 x i32> [[TMP10]] to <4 x i8>
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE0:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> poison, <4 x i8> [[TMP22]], i64 0)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE1:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE0]], <4 x i8> [[TMP21]], i64 4)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE2:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE1]], <4 x i8> [[TMP20]], i64 8)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE3:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE2]], <4 x i8> [[TMP19]], i64 12)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE4:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE3]], <4 x i8> [[TMP18]], i64 16)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE5:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE4]], i8 [[TMP15]], i64 20
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE6:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE5]], i8 [[TMP16]], i64 21
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE7:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE6]], i8 [[TMP17]], i64 22
 ; OPT-NEXT:    store <23 x i8> [[LARGEPHI_INSERTSLICE7]], ptr [[OUT:%.*]], align 1
 ; OPT-NEXT:    ret void
 ;
@@ -539,13 +554,19 @@ define amdgpu_kernel void @phi_v23i8(<23 x i8> %in, ptr %out, i1 %cond) {
 ; NOOPT-NEXT:    br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]]
 ; NOOPT:       then:
 ; NOOPT-NEXT:    [[X:%.*]] = insertelement <23 x i8> [[IN:%.*]], i8 42, i32 3
+; NOOPT-NEXT:    [[TMP0:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; NOOPT-NEXT:    [[TMP1:%.*]] = bitcast <24 x i8> [[TMP0]] to <6 x i32>
 ; NOOPT-NEXT:    br label [[FINALLY:%.*]]
 ; NOOPT:       else:
 ; NOOPT-NEXT:    [[Y:%.*]] = insertelement <23 x i8> [[IN]], i8 64, i32 6
+; NOOPT-NEXT:    [[TMP2:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; NOOPT-NEXT:    [[TMP3:%.*]] = bitcast <24 x i8> [[TMP2]] to <6 x i32>
 ; NOOPT-NEXT:    br label [[FINALLY]]
 ; NOOPT:       finally:
-; NOOPT-NEXT:    [[VAL:%.*]] = phi <23 x i8> [ [[X]], [[THEN]] ], [ [[Y]], [[ELSE]] ]
-; NOOPT-NEXT:    store <23 x i8> [[VAL]], ptr [[OUT:%.*]], align 1
+; NOOPT-NEXT:    [[TMP4:%.*]] = phi <6 x i32> [ [[TMP1]], [[THEN]] ], [ [[TMP3]], [[ELSE]] ]
+; NOOPT-NEXT:    [[TMP5:%.*]] = bitcast <6 x i32> [[TMP4]] to <24 x i8>
+; NOOPT-NEXT:    [[TMP6:%.*]] = shufflevector <24 x i8> [[TMP5]], <24 x i8> poison, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22>
+; NOOPT-NEXT:    store <23 x i8> [[TMP6]], ptr [[OUT:%.*]], align 1
 ; NOOPT-NEXT:    ret void
 ;
 entry:
@@ -572,31 +593,36 @@ define amdgpu_kernel void @phi_v23i8_zeroinit(<23 x i8> %in, ptr %out, i1 %cond)
 ; OPT:       else:
 ; OPT-NEXT:    [[Y:%.*]] = insertelement <23 x i8> [[IN:%.*]], i8 64, i32 6
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE1:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; OPT-NEXT:    [[TMP0:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE1]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE3:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; OPT-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE3]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE5:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; OPT-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE5]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE7:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; OPT-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE7]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE9:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; OPT-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE9]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE11:%.*]] = extractelement <23 x i8> [[Y]], i64 20
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE13:%.*]] = extractelement <23 x i8> [[Y]], i64 21
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE15:%.*]] = extractelement <23 x i8> [[Y]], i64 22
 ; OPT-NEXT:    br label [[FINALLY]]
 ; OPT:       finally:
-; OPT-NEXT:    [[TMP0:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP1:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP2:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP3:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP4:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP5:%.*]] = phi i8 [ 0, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP6:%.*]] = phi i8 [ 0, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE13]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP7:%.*]] = phi i8 [ 0, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE15]], [[ELSE]] ]
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE0:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> poison, <4 x i8> [[TMP0]], i64 0)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE1:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE0]], <4 x i8> [[TMP1]], i64 4)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE2:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE1]], <4 x i8> [[TMP2]], i64 8)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE3:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE2]], <4 x i8> [[TMP3]], i64 12)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE4:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE3]], <4 x i8> [[TMP4]], i64 16)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE5:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE4]], i8 [[TMP5]], i64 20
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE6:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE5]], i8 [[TMP6]], i64 21
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE7:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE6]], i8 [[TMP7]], i64 22
+; OPT-NEXT:    [[TMP5:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP6:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP7:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP8:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP9:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP10:%.*]] = phi i8 [ 0, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP11:%.*]] = phi i8 [ 0, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE13]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP12:%.*]] = phi i8 [ 0, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE15]], [[ELSE]] ]
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE0:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> poison, <4 x i8> [[TMP5]], i64 0)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE1:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE0]], <4 x i8> [[TMP6]], i64 4)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE2:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE1]], <4 x i8> [[TMP7]], i64 8)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE3:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE2]], <4 x i8> [[TMP8]], i64 12)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE4:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE3]], <4 x i8> [[TMP9]], i64 16)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE5:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE4]], i8 [[TMP10]], i64 20
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE6:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE5]], i8 [[TMP11]], i64 21
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE7:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE6]], i8 [[TMP12]], i64 22
 ; OPT-NEXT:    store <23 x i8> [[LARGEPHI_INSERTSLICE7]], ptr [[OUT:%.*]], align 1
 ; OPT-NEXT:    ret void
 ;
@@ -607,6 +633,8 @@ define amdgpu_kernel void @phi_v23i8_zeroinit(<23 x i8> %in, ptr %out, i1 %cond)
 ; NOOPT-NEXT:    br label [[FINALLY:%.*]]
 ; NOOPT:       else:
 ; NOOPT-NEXT:    [[Y:%.*]] = insertelement <23 x i8> [[IN:%.*]], i8 64, i32 6
+; NOOPT-NEXT:    [[TMP0:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; NOOPT-NEXT:    [[TMP1:%.*]] = bitcast <24 x i8> [[TMP0]] to <6 x i32>
 ; NOOPT-NEXT:    br label [[FINALLY]]
 ; NOOPT:       finally:
 ; NOOPT-NEXT:    [[VAL:%.*]] = phi <23 x i8> [ zeroinitializer, [[THEN]] ], [ [[Y]], [[ELSE]] ]
@@ -635,25 +663,28 @@ define amdgpu_kernel void @phi_v15i8_random_constant_init(<15 x i8> %in, ptr %ou
 ; OPT:       else:
 ; OPT-NEXT:    [[Y:%.*]] = insertelement <15 x i8> [[IN:%.*]], i8 64, i32 6
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE1:%.*]] = shufflevector <15 x i8> [[Y]], <15 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; OPT-NEXT:    [[TMP0:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE1]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE3:%.*]] = shufflevector <15 x i8> [[Y]], <15 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; OPT-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE3]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE5:%.*]] = shufflevector <15 x i8> [[Y]], <15 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; OPT-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE5]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE7:%.*]] = extractelement <15 x i8> [[Y]], i64 12
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE9:%.*]] = extractelement <15 x i8> [[Y]], i64 13
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE11:%.*]] = extractelement <15 x i8> [[Y]], i64 14
 ; OPT-NEXT:    br label [[FINALLY]]
 ; OPT:       finally:
-; OPT-NEXT:    [[TMP0:%.*]] = phi <4 x i8> [ <i8 poison, i8 1, i8 2, i8 3>, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP1:%.*]] = phi <4 x i8> [ <i8 4, i8 undef, i8 6, i8 7>, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP2:%.*]] = phi <4 x i8> [ <i8 9, i8 10, i8 11, i8 12>, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP3:%.*]] = phi i8 [ 13, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP4:%.*]] = phi i8 [ 14, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP5:%.*]] = phi i8 [ undef, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ]
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE0:%.*]] = call <15 x i8> @llvm.vector.insert.v15i8.v4i8(<15 x i8> poison, <4 x i8> [[TMP0]], i64 0)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE1:%.*]] = call <15 x i8> @llvm.vector.insert.v15i8.v4i8(<15 x i8> [[LARGEPHI_INSERTSLICE0]], <4 x i8> [[TMP1]], i64 4)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE2:%.*]] = call <15 x i8> @llvm.vector.insert.v15i8.v4i8(<15 x i8> [[LARGEPHI_INSERTSLICE1]], <4 x i8> [[TMP2]], i64 8)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE3:%.*]] = insertelement <15 x i8> [[LARGEPHI_INSERTSLICE2]], i8 [[TMP3]], i64 12
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE4:%.*]] = insertelement <15 x i8> [[LARGEPHI_INSERTSLICE3]], i8 [[TMP4]], i64 13
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE5:%.*]] = insertelement <15 x i8> [[LARGEPHI_INSERTSLICE4]], i8 [[TMP5]], i64 14
+; OPT-NEXT:    [[TMP3:%.*]] = phi <4 x i8> [ <i8 poison, i8 1, i8 2, i8 3>, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP4:%.*]] = phi <4 x i8> [ <i8 4, i8 undef, i8 6, i8 7>, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP5:%.*]] = phi <4 x i8> [ <i8 9, i8 10, i8 11, i8 12>, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP6:%.*]] = phi i8 [ 13, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP7:%.*]] = phi i8 [ 14, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP8:%.*]] = phi i8 [ undef, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ]
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE0:%.*]] = call <15 x i8> @llvm.vector.insert.v15i8.v4i8(<15 x i8> poison, <4 x i8> [[TMP3]], i64 0)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE1:%.*]] = call <15 x i8> @llvm.vector.insert.v15i8.v4i8(<15 x i8> [[LARGEPHI_INSERTSLICE0]], <4 x i8> [[TMP4]], i64 4)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE2:%.*]] = call <15 x i8> @llvm.vector.insert.v15i8.v4i8(<15 x i8> [[LARGEPHI_INSERTSLICE1]], <4 x i8> [[TMP5]], i64 8)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE3:%.*]] = insertelement <15 x i8> [[LARGEPHI_INSERTSLICE2]], i8 [[TMP6]], i64 12
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE4:%.*]] = insertelement <15 x i8> [[LARGEPHI_INSERTSLICE3]], i8 [[TMP7]], i64 13
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE5:%.*]] = insertelement <15 x i8> [[LARGEPHI_INSERTSLICE4]], i8 [[TMP8]], i64 14
 ; OPT-NEXT:    store <15 x i8> [[LARGEPHI_INSERTSLICE5]], ptr [[OUT:%.*]], align 1
 ; OPT-NEXT:    ret void
 ;
@@ -664,6 +695,8 @@ define amdgpu_kernel void @phi_v15i8_random_constant_init(<15 x i8> %in, ptr %ou
 ; NOOPT-NEXT:    br label [[FINALLY:%.*]]
 ; NOOPT:       else:
 ; NOOPT-NEXT:    [[Y:%.*]] = insertelement <15 x i8> [[IN:%.*]], i8 64, i32 6
+; NOOPT-NEXT:    [[TMP0:%.*]] = shufflevector <15 x i8> [[Y]], <15 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; NOOPT-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
 ; NOOPT-NEXT:    br label [[FINALLY]]
 ; NOOPT:       finally:
 ; NOOPT-NEXT:    [[VAL:%.*]] = phi <15 x i8> [ <i8 poison, i8 1, i8 2, i8 3, i8 4, i8 undef, i8 6, i8 7, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 undef>, [[THEN]] ], [ [[Y]], [[ELSE]] ]
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index f78b408d78255..57179f8f26aec 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -6,27 +6,31 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 2, v0
+; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX906-NEXT:    v_mov_b32_e32 v3, 8
 ; GFX906-NEXT:    v_mov_b32_e32 v1, 0
-; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dword v2, v5, s[4:5]
+; GFX906-NEXT:    global_load_dword v4, v2, s[4:5]
+; GFX906-NEXT:    s_mov_b32 s4, 0xff0000
+; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 8, v2
+; GFX906-NEXT:    v_lshrrev_b32_sdwa v5, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX906-NEXT:    v_and_or_b32 v4, v4, s4, v5
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB0_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dword v2, v5, s[6:7]
+; GFX906-NEXT:    global_load_dword v0, v2, s[6:7]
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 8, v2
+; GFX906-NEXT:    v_lshrrev_b32_sdwa v2, v3, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX906-NEXT:    v_and_or_b32 v4, v0, s4, v2
 ; GFX906-NEXT:  .LBB0_2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v4
-; GFX906-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_byte v1, v3, s[2:3] offset:2
-; GFX906-NEXT:    global_store_short v1, v0, s[2:3]
+; GFX906-NEXT:    global_store_byte_d16_hi v1, v4, s[2:3] offset:2
+; GFX906-NEXT:    global_store_short v1, v4, s[2:3]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -50,31 +54,19 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 2, v0
+; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
 ; GFX906-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dword v2, v6, s[4:5]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
+; GFX906-NEXT:    global_load_dword v2, v3, s[4:5]
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dword v2, v6, s[6:7]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
+; GFX906-NEXT:    global_load_dword v2, v3, s[6:7]
 ; GFX906-NEXT:  .LBB1_2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v5
-; GFX906-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    global_store_dword v1, v2, s[2:3]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -98,32 +90,23 @@ define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 3, v0
-; GFX906-NEXT:    v_mov_b32_e32 v5, 0
+; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; GFX906-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v7, s[4:5]
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v4, s[4:5]
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b64 v[3:4], 24, v[1:2]
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
+; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v7, s[6:7]
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v4, s[6:7]
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b64 v[3:4], 24, v[1:2]
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
+; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX906-NEXT:  .LBB2_2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v6
-; GFX906-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_byte v5, v2, s[2:3] offset:4
-; GFX906-NEXT:    global_store_dword v5, v0, s[2:3]
+; GFX906-NEXT:    global_store_byte v3, v2, s[2:3] offset:4
+; GFX906-NEXT:    global_store_dword v3, v1, s[2:3]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -147,42 +130,19 @@ define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 3, v0
+; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX906-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v10, s[4:5]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v4, s[4:5]
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v10, s[6:7]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v4, s[6:7]
 ; GFX906-NEXT:  .LBB3_2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v9
-; GFX906-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v7
-; GFX906-NEXT:    v_or_b32_sdwa v1, v8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v6
-; GFX906-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v4
-; GFX906-NEXT:    v_or_b32_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx2 v3, v[0:1], s[2:3]
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    global_store_dwordx2 v3, v[1:2], s[2:3]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -206,64 +166,19 @@ define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v18, 4, v0
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 4, v0
 ; GFX906-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx4 v[1:4], v18, s[4:5]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 24, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 8, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 8, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 8, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 24, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v1
+; GFX906-NEXT:    global_load_dwordx4 v[1:4], v6, s[4:5]
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx4 v[1:4], v18, s[6:7]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 24, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 8, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 8, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 8, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 24, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v1
+; GFX906-NEXT:    global_load_dwordx4 v[1:4], v6, s[6:7]
 ; GFX906-NEXT:  .LBB4_2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v17
-; GFX906-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v15
-; GFX906-NEXT:    v_or_b32_sdwa v1, v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v14
-; GFX906-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v12
-; GFX906-NEXT:    v_or_b32_sdwa v2, v13, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v11
-; GFX906-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v9
-; GFX906-NEXT:    v_or_b32_sdwa v3, v10, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v8
-; GFX906-NEXT:    v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v4, 8, v6
-; GFX906-NEXT:    v_or_b32_sdwa v4, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v5, v[0:3], s[2:3]
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    global_store_dwordx4 v5, v[1:4], s[2:3]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -286,114 +201,24 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
 ; GFX906-LABEL: v32i8_liveout:
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX906-NEXT:    v_lshlrev_b32_e32 v31, 5, v0
-; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 5, v0
 ; GFX906-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx4 v[1:4], v31, s[4:5] offset:16
-; GFX906-NEXT:    global_load_dwordx4 v[5:8], v31, s[4:5]
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 16, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 8, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 24, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 8, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v18, 24, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v19, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v20, 8, v1
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v21, 24, v8
-; GFX906-NEXT:    v_lshrrev_b32_e32 v22, 16, v8
-; GFX906-NEXT:    v_lshrrev_b32_e32 v23, 8, v8
-; GFX906-NEXT:    v_lshrrev_b32_e32 v24, 24, v7
-; GFX906-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
-; GFX906-NEXT:    v_lshrrev_b32_e32 v26, 8, v7
-; GFX906-NEXT:    v_lshrrev_b32_e32 v27, 24, v6
-; GFX906-NEXT:    v_lshrrev_b32_e32 v28, 16, v6
-; GFX906-NEXT:    v_lshrrev_b32_e32 v29, 8, v6
-; GFX906-NEXT:    v_lshrrev_b32_e32 v30, 24, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v32, 16, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v33, 8, v5
-; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX906-NEXT:    global_load_dwordx4 v[1:4], v10, s[4:5] offset:16
+; GFX906-NEXT:    global_load_dwordx4 v[5:8], v10, s[4:5]
+; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx4 v[1:4], v31, s[6:7] offset:16
-; GFX906-NEXT:    global_load_dwordx4 v[5:8], v31, s[6:7]
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 16, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 8, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 24, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 8, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v18, 24, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v19, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v20, 8, v1
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v21, 24, v8
-; GFX906-NEXT:    v_lshrrev_b32_e32 v22, 16, v8
-; GFX906-NEXT:    v_lshrrev_b32_e32 v23, 8, v8
-; GFX906-NEXT:    v_lshrrev_b32_e32 v24, 24, v7
-; GFX906-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
-; GFX906-NEXT:    v_lshrrev_b32_e32 v26, 8, v7
-; GFX906-NEXT:    v_lshrrev_b32_e32 v27, 24, v6
-; GFX906-NEXT:    v_lshrrev_b32_e32 v28, 16, v6
-; GFX906-NEXT:    v_lshrrev_b32_e32 v29, 8, v6
-; GFX906-NEXT:    v_lshrrev_b32_e32 v30, 24, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v32, 16, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v33, 8, v5
+; GFX906-NEXT:    global_load_dwordx4 v[1:4], v10, s[6:7] offset:16
+; GFX906-NEXT:    global_load_dwordx4 v[5:8], v10, s[6:7]
 ; GFX906-NEXT:  .LBB5_2: ; %bb.2
-; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX906-NEXT:    v_lshlrev_b16_e32 v30, 8, v30
-; GFX906-NEXT:    v_lshlrev_b16_e32 v31, 8, v33
-; GFX906-NEXT:    v_lshlrev_b16_e32 v29, 8, v29
-; GFX906-NEXT:    v_lshlrev_b16_e32 v27, 8, v27
-; GFX906-NEXT:    v_lshlrev_b16_e32 v26, 8, v26
-; GFX906-NEXT:    v_lshlrev_b16_e32 v24, 8, v24
-; GFX906-NEXT:    v_lshlrev_b16_e32 v23, 8, v23
-; GFX906-NEXT:    v_lshlrev_b16_e32 v21, 8, v21
-; GFX906-NEXT:    v_or_b32_sdwa v30, v32, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v5, v5, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v6, v6, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v27, v28, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v7, v7, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v24, v25, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v8, v8, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v5, v5, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v6, v6, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v7, v7, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v8, v8, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v9, v[5:8], s[0:1]
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v20
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v18
-; GFX906-NEXT:    v_or_b32_sdwa v5, v19, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v17
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v15
-; GFX906-NEXT:    v_or_b32_sdwa v5, v16, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v14
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v12
-; GFX906-NEXT:    v_or_b32_sdwa v5, v13, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v11
-; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v0, v10, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v9, v[1:4], s[0:1] offset:16
+; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    global_store_dwordx4 v9, v[1:4], s[2:3] offset:16
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    global_store_dwordx4 v9, v[5:8], s[2:3]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -415,1555 +240,98 @@ bb.2:
 define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
 ; GFX906-LABEL: v256i8_liveout:
 ; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX906-NEXT:    v_lshlrev_b32_e32 v61, 3, v0
 ; GFX906-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
 ; GFX906-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
 ; GFX906-NEXT:    s_mov_b32 s10, -1
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    global_load_dwordx4 v[5:8], v61, s[4:5] offset:240
 ; GFX906-NEXT:    s_mov_b32 s11, 0xe00000
 ; GFX906-NEXT:    s_add_u32 s8, s8, s3
-; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v63, 3, v0
 ; GFX906-NEXT:    s_addc_u32 s9, s9, 0
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx4 v[17:20], v63, s[4:5] offset:240
-; GFX906-NEXT:    global_load_dwordx4 v[5:8], v63, s[4:5] offset:224
-; GFX906-NEXT:    global_load_dwordx4 v[9:12], v63, s[4:5] offset:208
-; GFX906-NEXT:    global_load_dwordx4 v[13:16], v63, s[4:5] offset:192
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX906-NEXT:    v_mov_b32_e32 v4, 0
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v20
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v20
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v20
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v19
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v19
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v19
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v18
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v18
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v18
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v17
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v17
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 ; 4-byte Folded Spill
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    buffer_store_dword v18, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v19, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v20, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v17
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v8
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v8
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v8
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v7
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v7
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v7
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v6
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v6
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v5
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v5
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v12
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v12
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v12
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v11
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v11
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v11
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v10
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v10
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v10
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v9
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v9
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v9
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v16
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v16
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v16
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v15
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v15
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v15
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v14
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v14
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v14
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v13
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v13
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v13
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[17:20], v63, s[4:5] offset:176
-; GFX906-NEXT:    global_load_dwordx4 v[21:24], v63, s[4:5] offset:160
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v20
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v20
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v20
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v19
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v19
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v19
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v18
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v18
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v18
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v17
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v17
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v17
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v24
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v24
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v24
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v23
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v23
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v23
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v22
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v22
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v22
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v21
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v21
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v21
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[25:28], v63, s[4:5] offset:144
-; GFX906-NEXT:    global_load_dwordx4 v[29:32], v63, s[4:5] offset:128
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v28
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v28
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v28
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v27
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v27
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v27
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v26
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v26
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v26
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v25
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v25
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v25
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v32
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v32
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v32
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v31
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v31
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v31
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v30
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v30
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v30
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v29
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v29
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v29
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[33:36], v63, s[4:5] offset:112
-; GFX906-NEXT:    global_load_dwordx4 v[37:40], v63, s[4:5] offset:96
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v36
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v36
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v36
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v35
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v35
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v35
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v34
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v34
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v34
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v33
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v33
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v33
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v40
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v40
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v40
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v39
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v39
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v39
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v38
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v38
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v38
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v37
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v37
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v37
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[41:44], v63, s[4:5] offset:80
-; GFX906-NEXT:    global_load_dwordx4 v[45:48], v63, s[4:5] offset:64
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v44
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v44
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v44
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v43
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v43
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v43
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v42
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v42
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v42
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v41
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v41
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v41
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v48
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v48
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v48
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v47
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v47
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v47
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v46
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v46
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v46
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v45
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v45
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v45
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[49:52], v63, s[4:5] offset:48
-; GFX906-NEXT:    global_load_dwordx4 v[53:56], v63, s[4:5] offset:32
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v52
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v52
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v52
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v51
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v51
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v51
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v50
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v50
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v50
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v49
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v49
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v49
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v56
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v56
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v56
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v55
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v55
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v55
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v54
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v54
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v54
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v53
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v53
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v53
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[57:60], v63, s[4:5] offset:16
+; GFX906-NEXT:    buffer_store_dword v5, off, s[8:11], 0 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    buffer_store_dword v6, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v7, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v8, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[5:8], v61, s[4:5] offset:224
 ; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    global_load_dwordx4 v[0:3], v63, s[4:5]
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v60
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v60
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v60
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v59
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v59
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v59
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v58
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v58
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v58
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v57
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v57
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v57
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v3
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v3
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v3
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v2
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v2
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v2
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 24, v0
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v1
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 16, v0
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v1
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 8, v0
-; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX906-NEXT:    global_load_dwordx4 v[9:12], v61, s[4:5] offset:208
+; GFX906-NEXT:    global_load_dwordx4 v[13:16], v61, s[4:5] offset:192
+; GFX906-NEXT:    global_load_dwordx4 v[17:20], v61, s[4:5] offset:176
+; GFX906-NEXT:    global_load_dwordx4 v[21:24], v61, s[4:5] offset:160
+; GFX906-NEXT:    global_load_dwordx4 v[25:28], v61, s[4:5] offset:144
+; GFX906-NEXT:    global_load_dwordx4 v[29:32], v61, s[4:5] offset:128
+; GFX906-NEXT:    global_load_dwordx4 v[33:36], v61, s[4:5] offset:112
+; GFX906-NEXT:    global_load_dwordx4 v[37:40], v61, s[4:5] offset:96
+; GFX906-NEXT:    global_load_dwordx4 v[41:44], v61, s[4:5] offset:80
+; GFX906-NEXT:    global_load_dwordx4 v[45:48], v61, s[4:5] offset:64
+; GFX906-NEXT:    global_load_dwordx4 v[49:52], v61, s[4:5] offset:48
+; GFX906-NEXT:    global_load_dwordx4 v[53:56], v61, s[4:5] offset:32
+; GFX906-NEXT:    global_load_dwordx4 v[57:60], v61, s[4:5] offset:16
+; GFX906-NEXT:    global_load_dwordx4 v[0:3], v61, s[4:5]
+; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx4 v[0:3], v63, s[6:7] offset:240
-; GFX906-NEXT:    global_load_dwordx4 v[5:8], v63, s[6:7] offset:224
-; GFX906-NEXT:    global_load_dwordx4 v[9:12], v63, s[6:7] offset:208
-; GFX906-NEXT:    global_load_dwordx4 v[13:16], v63, s[6:7] offset:192
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 24, v3
-; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 16, v3
-; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v3
-; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 24, v2
-; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 16, v2
-; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v2
-; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 24, v1
-; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 16, v1
-; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v1
-; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 24, v0
-; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
-; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[0:3], v61, s[6:7] offset:240
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 ; 4-byte Folded Spill
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    buffer_store_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
 ; GFX906-NEXT:    buffer_store_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
 ; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v8
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v8
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v8
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v7
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v7
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v7
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v6
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v6
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v5
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v5
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v12
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v12
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v12
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v11
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v11
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v11
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v10
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v10
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v10
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v9
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v9
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v9
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v16
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v16
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v16
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v15
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v15
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v15
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v14
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v14
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v14
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v13
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v13
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v13
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[17:20], v63, s[6:7] offset:176
-; GFX906-NEXT:    global_load_dwordx4 v[21:24], v63, s[6:7] offset:160
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v20
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v20
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v20
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v19
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v19
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v19
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v18
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v18
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v18
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v17
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v17
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v17
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v24
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v24
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v24
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v23
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v23
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v23
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v22
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v22
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v22
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v21
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v21
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v21
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[25:28], v63, s[6:7] offset:144
-; GFX906-NEXT:    global_load_dwordx4 v[29:32], v63, s[6:7] offset:128
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v28
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v28
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v28
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v27
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v27
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v27
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v26
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v26
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v26
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v25
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v25
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v25
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v32
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v32
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v32
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v31
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v31
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v31
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v30
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v30
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v30
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v29
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v29
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v29
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[33:36], v63, s[6:7] offset:112
-; GFX906-NEXT:    global_load_dwordx4 v[37:40], v63, s[6:7] offset:96
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v36
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v36
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v36
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v35
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v35
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v35
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v34
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v34
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v34
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v33
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v33
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v33
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v40
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v40
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v40
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v39
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v39
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v39
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v38
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v38
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v38
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v37
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v37
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v37
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[41:44], v63, s[6:7] offset:80
-; GFX906-NEXT:    global_load_dwordx4 v[45:48], v63, s[6:7] offset:64
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v44
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v44
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v44
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v43
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v43
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v43
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v42
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v42
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v42
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v41
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v41
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v41
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v48
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v48
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v48
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v47
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v47
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v47
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v46
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v46
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v46
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v45
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v45
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v45
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[49:52], v63, s[6:7] offset:48
-; GFX906-NEXT:    global_load_dwordx4 v[53:56], v63, s[6:7] offset:32
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v52
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v52
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v52
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v51
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v51
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v51
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v50
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v50
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v50
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v49
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v49
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v49
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v56
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v56
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v56
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v55
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v55
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v55
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v54
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v54
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v54
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v53
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v53
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v53
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[57:60], v63, s[6:7] offset:16
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    global_load_dwordx4 v[0:3], v63, s[6:7]
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v60
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v60
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v60
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v59
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v59
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v59
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v58
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v58
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v58
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v57
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v57
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v57
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v3
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v3
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v3
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v2
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v2
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v2
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 24, v0
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v1
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 16, v0
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v1
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 8, v0
+; GFX906-NEXT:    global_load_dwordx4 v[5:8], v61, s[6:7] offset:224
+; GFX906-NEXT:    global_load_dwordx4 v[9:12], v61, s[6:7] offset:208
+; GFX906-NEXT:    global_load_dwordx4 v[13:16], v61, s[6:7] offset:192
+; GFX906-NEXT:    global_load_dwordx4 v[17:20], v61, s[6:7] offset:176
+; GFX906-NEXT:    global_load_dwordx4 v[21:24], v61, s[6:7] offset:160
+; GFX906-NEXT:    global_load_dwordx4 v[25:28], v61, s[6:7] offset:144
+; GFX906-NEXT:    global_load_dwordx4 v[29:32], v61, s[6:7] offset:128
+; GFX906-NEXT:    global_load_dwordx4 v[33:36], v61, s[6:7] offset:112
+; GFX906-NEXT:    global_load_dwordx4 v[37:40], v61, s[6:7] offset:96
+; GFX906-NEXT:    global_load_dwordx4 v[41:44], v61, s[6:7] offset:80
+; GFX906-NEXT:    global_load_dwordx4 v[45:48], v61, s[6:7] offset:64
+; GFX906-NEXT:    global_load_dwordx4 v[49:52], v61, s[6:7] offset:48
+; GFX906-NEXT:    global_load_dwordx4 v[53:56], v61, s[6:7] offset:32
+; GFX906-NEXT:    global_load_dwordx4 v[57:60], v61, s[6:7] offset:16
+; GFX906-NEXT:    global_load_dwordx4 v[0:3], v61, s[6:7]
 ; GFX906-NEXT:  .LBB6_2: ; %bb.2
-; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    v_lshlrev_b16_e32 v61, 8, v61
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b16_e32 v62, 8, v62
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v63, off, s[8:11], 0 offset:760 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v61, 8, v61
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v61, 8, v61
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v61, off, s[8:11], 0 offset:768 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v61, 8, v61
-; GFX906-NEXT:    v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v62, off, s[8:11], 0 offset:756 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v62, 8, v62
-; GFX906-NEXT:    v_or_b32_sdwa v62, v63, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v62, off, s[8:11], 0 offset:748 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v61, 8, v61
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v61, off, s[8:11], 0 offset:736 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v62, off, s[8:11], 0 offset:740 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v61, 8, v61
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:732 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:728 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:716 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:704 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    buffer_load_dword v57, off, s[8:11], 0 offset:720 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v58, off, s[8:11], 0 offset:724 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v59, off, s[8:11], 0 offset:712 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v57, 8, v57
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v58, off, s[8:11], 0 offset:708 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v57, off, s[8:11], 0 offset:696 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v58, 8, v58
-; GFX906-NEXT:    v_or_b32_sdwa v58, v59, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v58, off, s[8:11], 0 offset:700 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v57, 8, v57
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v57, off, s[8:11], 0 offset:688 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v58, off, s[8:11], 0 offset:692 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v60, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v57, 8, v57
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:680 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:668 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:656 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    buffer_load_dword v53, off, s[8:11], 0 offset:672 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v54, off, s[8:11], 0 offset:676 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v55, off, s[8:11], 0 offset:664 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v53, 8, v53
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v54, off, s[8:11], 0 offset:660 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v53, off, s[8:11], 0 offset:648 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v54, 8, v54
-; GFX906-NEXT:    v_or_b32_sdwa v54, v55, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v54, off, s[8:11], 0 offset:652 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v53, 8, v53
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v53, off, s[8:11], 0 offset:640 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v54, off, s[8:11], 0 offset:644 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v53, 8, v53
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:32
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:632 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:620 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:608 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    buffer_load_dword v49, off, s[8:11], 0 offset:624 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v50, off, s[8:11], 0 offset:628 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v51, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v51, off, s[8:11], 0 offset:616 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v49, 8, v49
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v50, off, s[8:11], 0 offset:612 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v49, off, s[8:11], 0 offset:600 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v50, 8, v50
-; GFX906-NEXT:    v_or_b32_sdwa v50, v51, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v50, off, s[8:11], 0 offset:604 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v49, 8, v49
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v49, off, s[8:11], 0 offset:592 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v50, off, s[8:11], 0 offset:596 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v49, 8, v49
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:48
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:584 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:572 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:560 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    buffer_load_dword v45, off, s[8:11], 0 offset:576 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v46, off, s[8:11], 0 offset:580 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v47, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v47, off, s[8:11], 0 offset:568 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v45, 8, v45
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v46, off, s[8:11], 0 offset:564 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v45, off, s[8:11], 0 offset:552 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v46, 8, v46
-; GFX906-NEXT:    v_or_b32_sdwa v46, v47, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v46, off, s[8:11], 0 offset:556 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v45, 8, v45
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v45, off, s[8:11], 0 offset:544 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v46, off, s[8:11], 0 offset:548 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v48, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v45, 8, v45
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:64
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:536 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:524 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:512 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    buffer_load_dword v41, off, s[8:11], 0 offset:528 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v42, off, s[8:11], 0 offset:532 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v43, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v43, off, s[8:11], 0 offset:520 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v41, 8, v41
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v42, off, s[8:11], 0 offset:516 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v41, off, s[8:11], 0 offset:504 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v42, 8, v42
-; GFX906-NEXT:    v_or_b32_sdwa v42, v43, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v42, off, s[8:11], 0 offset:508 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v41, 8, v41
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v41, off, s[8:11], 0 offset:496 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v42, off, s[8:11], 0 offset:500 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v44, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v41, 8, v41
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:80
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:488 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:476 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:464 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    buffer_load_dword v37, off, s[8:11], 0 offset:480 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v38, off, s[8:11], 0 offset:484 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v39, off, s[8:11], 0 offset:472 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v37, 8, v37
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v38, off, s[8:11], 0 offset:468 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v37, off, s[8:11], 0 offset:456 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v38, 8, v38
-; GFX906-NEXT:    v_or_b32_sdwa v38, v39, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v38, off, s[8:11], 0 offset:460 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v37, 8, v37
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v37, off, s[8:11], 0 offset:448 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v38, off, s[8:11], 0 offset:452 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v37, 8, v37
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:96
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:440 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:428 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:416 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    buffer_load_dword v33, off, s[8:11], 0 offset:432 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v34, off, s[8:11], 0 offset:436 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v35, off, s[8:11], 0 offset:424 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v33, 8, v33
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v34, off, s[8:11], 0 offset:420 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v33, off, s[8:11], 0 offset:408 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v34, 8, v34
-; GFX906-NEXT:    v_or_b32_sdwa v34, v35, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v34, off, s[8:11], 0 offset:412 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v33, 8, v33
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v33, off, s[8:11], 0 offset:400 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v34, off, s[8:11], 0 offset:404 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v33, 8, v33
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:112
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:392 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:380 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:368 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    buffer_load_dword v29, off, s[8:11], 0 offset:384 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v30, off, s[8:11], 0 offset:388 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v31, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v31, off, s[8:11], 0 offset:376 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v29, 8, v29
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v30, off, s[8:11], 0 offset:372 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v29, off, s[8:11], 0 offset:360 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v30, 8, v30
-; GFX906-NEXT:    v_or_b32_sdwa v30, v31, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v30, off, s[8:11], 0 offset:364 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v29, 8, v29
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v29, off, s[8:11], 0 offset:352 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v30, off, s[8:11], 0 offset:356 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v32, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v29, 8, v29
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:128
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:344 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:332 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:320 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    buffer_load_dword v25, off, s[8:11], 0 offset:336 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v26, off, s[8:11], 0 offset:340 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v27, off, s[8:11], 0 offset:328 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v25, 8, v25
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v26, off, s[8:11], 0 offset:324 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v25, off, s[8:11], 0 offset:312 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v26, 8, v26
-; GFX906-NEXT:    v_or_b32_sdwa v26, v27, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v26, off, s[8:11], 0 offset:316 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v25, 8, v25
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v25, off, s[8:11], 0 offset:304 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v26, off, s[8:11], 0 offset:308 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v28, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v25, 8, v25
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:144
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:296 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:284 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:272 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    buffer_load_dword v21, off, s[8:11], 0 offset:288 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v22, off, s[8:11], 0 offset:292 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v23, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v23, off, s[8:11], 0 offset:280 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v21, 8, v21
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v22, off, s[8:11], 0 offset:276 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v21, off, s[8:11], 0 offset:264 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v22, 8, v22
-; GFX906-NEXT:    v_or_b32_sdwa v22, v23, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v22, off, s[8:11], 0 offset:268 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v21, 8, v21
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v21, off, s[8:11], 0 offset:256 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v22, off, s[8:11], 0 offset:260 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v24, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v21, 8, v21
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:160
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:248 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:236 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:224 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    buffer_load_dword v17, off, s[8:11], 0 offset:240 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v18, off, s[8:11], 0 offset:244 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v19, off, s[8:11], 0 offset:232 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v17, 8, v17
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v18, off, s[8:11], 0 offset:228 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v17, off, s[8:11], 0 offset:216 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v18, 8, v18
-; GFX906-NEXT:    v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v18, off, s[8:11], 0 offset:220 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v17, 8, v17
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v17, off, s[8:11], 0 offset:208 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v18, off, s[8:11], 0 offset:212 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v17, 8, v17
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:176
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:204 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:192 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:184 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:188 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v14, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v14, off, s[8:11], 0 offset:164 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:196 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:180 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:176 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v13, off, s[8:11], 0 offset:160 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:172 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v13, 8, v13
-; GFX906-NEXT:    v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:168 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:192
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:152 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:148 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:140 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:128 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:144 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:136 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:132 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:124 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:116 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:120 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:112 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v9, 8, v9
-; GFX906-NEXT:    v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:208
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:104 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:100 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:92 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:80 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:96 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:88 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:84 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:76 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:68 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:72 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:64 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
-; GFX906-NEXT:    v_or_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:224
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:60 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:52 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:56 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:44 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(0)
+; GFX906-NEXT:    global_store_dwordx4 v4, v[33:36], s[0:1] offset:112
 ; GFX906-NEXT:    s_waitcnt vmcnt(7)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:48 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:40 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:32 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:36 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:20 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:24 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
-; GFX906-NEXT:    v_or_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:240
+; GFX906-NEXT:    global_store_dwordx4 v4, v[37:40], s[0:1] offset:96
+; GFX906-NEXT:    s_waitcnt vmcnt(7)
+; GFX906-NEXT:    global_store_dwordx4 v4, v[41:44], s[0:1] offset:80
+; GFX906-NEXT:    s_waitcnt vmcnt(7)
+; GFX906-NEXT:    global_store_dwordx4 v4, v[45:48], s[0:1] offset:64
+; GFX906-NEXT:    s_waitcnt vmcnt(7)
+; GFX906-NEXT:    global_store_dwordx4 v4, v[49:52], s[0:1] offset:48
+; GFX906-NEXT:    s_waitcnt vmcnt(7)
+; GFX906-NEXT:    global_store_dwordx4 v4, v[53:56], s[0:1] offset:32
+; GFX906-NEXT:    s_waitcnt vmcnt(7)
+; GFX906-NEXT:    global_store_dwordx4 v4, v[57:60], s[0:1] offset:16
+; GFX906-NEXT:    s_waitcnt vmcnt(7)
+; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:240
+; GFX906-NEXT:    global_store_dwordx4 v4, v[5:8], s[0:1] offset:224
+; GFX906-NEXT:    global_store_dwordx4 v4, v[9:12], s[0:1] offset:208
+; GFX906-NEXT:    global_store_dwordx4 v4, v[13:16], s[0:1] offset:192
+; GFX906-NEXT:    global_store_dwordx4 v4, v[17:20], s[0:1] offset:176
+; GFX906-NEXT:    global_store_dwordx4 v4, v[21:24], s[0:1] offset:160
+; GFX906-NEXT:    global_store_dwordx4 v4, v[25:28], s[0:1] offset:144
+; GFX906-NEXT:    global_store_dwordx4 v4, v[29:32], s[0:1] offset:128
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()

>From 8d082d239eb8d064fedefad16862c34b51282d89 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 21 Feb 2024 08:48:41 -0800
Subject: [PATCH 02/17] Handle loop edge in PHI nodes + Port to
 LateCodegenPrepare + Move LateCodeGenPrepare after CodeSinking + Integrate
 the loops

Change-Id: Iac0baf0ab9e523bf303585b545f060293e6fb4f0
---
 .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp    | 338 -----------------
 .../AMDGPU/AMDGPULateCodeGenPrepare.cpp       | 356 +++++++++++++++++-
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |   4 +-
 .../amdgpu-codegenprepare-break-large-phis.ll | 133 +++----
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll      |  24 +-
 llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll     |  24 +-
 llvm/test/CodeGen/AMDGPU/vni8-loop-carried.ll |  46 +++
 7 files changed, 472 insertions(+), 453 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/vni8-loop-carried.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 8f1dd1c522b04..a37302d3c4126 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -344,85 +344,6 @@ class AMDGPUCodeGenPrepare : public FunctionPass {
   StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
 };
 
-class LiveRegConversion {
-private:
-  // The instruction which defined the original virtual register used across
-  // blocks
-  Instruction *LiveRegDef;
-  // The original type
-  Type *OriginalType;
-  // The desired type
-  Type *NewType;
-  // The instruction sequence that converts the virtual register, to be used
-  // instead of the original
-  std::optional<Instruction *> Converted;
-  // The builder used to build the conversion instruction
-  IRBuilder<> ConvertBuilder;
-
-public:
-  // The instruction which defined the original virtual register used across
-  // blocks
-  Instruction *getLiveRegDef() { return LiveRegDef; }
-  // The original type
-  Type *getOriginalType() { return OriginalType; }
-  // The desired type
-  Type *getNewType() { return NewType; }
-  void setNewType(Type *NewType) { this->NewType = NewType; }
-  // The instruction that conerts the virtual register, to be used instead of
-  // the original
-  std::optional<Instruction *> &getConverted() { return Converted; }
-  void setConverted(Instruction *Converted) { this->Converted = Converted; }
-  // The builder used to build the conversion instruction
-  IRBuilder<> &getConverBuilder() { return ConvertBuilder; }
-  // Do we have a instruction sequence which convert the original virtual
-  // register
-  bool hasConverted() { return Converted.has_value(); }
-
-  LiveRegConversion(Instruction *LiveRegDef, BasicBlock *InsertBlock,
-                    BasicBlock::iterator InsertPt)
-      : LiveRegDef(LiveRegDef), OriginalType(LiveRegDef->getType()),
-        ConvertBuilder(InsertBlock, InsertPt) {}
-  LiveRegConversion(Instruction *LiveRegDef, Type *NewType,
-                    BasicBlock *InsertBlock, BasicBlock::iterator InsertPt)
-      : LiveRegDef(LiveRegDef), OriginalType(LiveRegDef->getType()),
-        NewType(NewType), ConvertBuilder(InsertBlock, InsertPt) {}
-};
-
-class LiveRegOptimizer {
-private:
-  Module *Mod = nullptr;
-  // The scalar type to convert to
-  Type *ConvertToScalar;
-  // Holds the collection of PHIs with their pending new operands
-  SmallVector<std::pair<Instruction *,
-                        SmallVector<std::pair<Instruction *, BasicBlock *>, 4>>,
-              4>
-      PHIUpdater;
-
-public:
-  // Should the def of the instruction be converted if it is live across blocks
-  bool shouldReplaceUses(const Instruction &I);
-  // Convert the virtual register to the compatible vector of legal type
-  void convertToOptType(LiveRegConversion &LR);
-  // Convert the virtual register back to the original type, stripping away
-  // the MSBs in cases where there was an imperfect fit (e.g. v2i32 -> v7i8)
-  void convertFromOptType(LiveRegConversion &LR);
-  // Get a vector of desired scalar type that is compatible with the original
-  // vector. In cases where there is no bitsize equivalent using a legal vector
-  // type, we pad the MSBs (e.g. v7i8 -> v2i32)
-  Type *getCompatibleType(Instruction *InstToConvert);
-  // Find and replace uses of the virtual register in different block with a
-  // newly produced virtual register of legal type
-  bool replaceUses(Instruction &I);
-  // Replace the collected PHIs with newly produced incoming values. Replacement
-  // is only done if we have a replacement for each original incoming value.
-  bool replacePHIs();
-
-  LiveRegOptimizer(Module *Mod) : Mod(Mod) {
-    ConvertToScalar = Type::getInt32Ty(Mod->getContext());
-  }
-};
-
 } // end anonymous namespace
 
 bool AMDGPUCodeGenPrepareImpl::run(Function &F) {
@@ -453,268 +374,9 @@ bool AMDGPUCodeGenPrepareImpl::run(Function &F) {
     }
   }
 
-  // GlobalISel should directly use the values, and do not need to emit
-  // CopyTo/CopyFrom Regs across blocks
-  if (UsesGlobalISel)
-    return MadeChange;
-
-  // "Optimize" the virtual regs that cross basic block boundaries. In such
-  // cases, vectors of illegal types will be scalarized and widened, with each
-  // scalar living in its own physical register. The optimization converts the
-  // vectors to equivalent vectors of legal type (which are convereted back
-  // before uses in subsequenmt blocks), to pack the bits into fewer physical
-  // registers (used in CopyToReg/CopyFromReg pairs).
-  LiveRegOptimizer LRO(Mod);
-  for (auto &BB : F) {
-    for (auto &I : BB) {
-      if (!LRO.shouldReplaceUses(I))
-        continue;
-      MadeChange |= LRO.replaceUses(I);
-    }
-  }
-
-  MadeChange |= LRO.replacePHIs();
   return MadeChange;
 }
 
-bool LiveRegOptimizer::replaceUses(Instruction &I) {
-  bool MadeChange = false;
-
-  struct ConvertUseInfo {
-    Instruction *Converted;
-    SmallVector<Instruction *, 4> Users;
-  };
-  DenseMap<BasicBlock *, ConvertUseInfo> UseConvertTracker;
-
-  LiveRegConversion FromLRC(
-      &I, I.getParent(),
-      static_cast<BasicBlock::iterator>(std::next(I.getIterator())));
-  FromLRC.setNewType(getCompatibleType(FromLRC.getLiveRegDef()));
-  for (auto IUser = I.user_begin(); IUser != I.user_end(); IUser++) {
-
-    if (auto UserInst = dyn_cast<Instruction>(*IUser)) {
-      if (UserInst->getParent() != I.getParent()) {
-        LLVM_DEBUG(dbgs() << *UserInst << "\n\tUses "
-                          << *FromLRC.getOriginalType()
-                          << " from previous block. Needs conversion\n");
-        convertToOptType(FromLRC);
-        if (!FromLRC.hasConverted())
-          continue;
-        // If it is a PHI node, just create and collect the new operand. We can
-        // only replace the PHI node once we have converted all the operands
-        if (auto PhiInst = dyn_cast<PHINode>(UserInst)) {
-          for (unsigned Idx = 0; Idx < PhiInst->getNumIncomingValues(); Idx++) {
-            auto IncVal = PhiInst->getIncomingValue(Idx);
-            if (&I == dyn_cast<Instruction>(IncVal)) {
-              auto IncBlock = PhiInst->getIncomingBlock(Idx);
-              auto PHIOps = find_if(
-                  PHIUpdater,
-                  [&UserInst](
-                      std::pair<Instruction *,
-                                SmallVector<
-                                    std::pair<Instruction *, BasicBlock *>, 4>>
-                          &Entry) { return Entry.first == UserInst; });
-
-              if (PHIOps == PHIUpdater.end())
-                PHIUpdater.push_back(
-                    {UserInst, {{*FromLRC.getConverted(), IncBlock}}});
-              else
-                PHIOps->second.push_back({*FromLRC.getConverted(), IncBlock});
-
-              break;
-            }
-          }
-          continue;
-        }
-
-        // Do not create multiple conversion sequences if there are multiple
-        // uses in the same block
-        if (UseConvertTracker.contains(UserInst->getParent())) {
-          UseConvertTracker[UserInst->getParent()].Users.push_back(UserInst);
-          LLVM_DEBUG(dbgs() << "\tUser already has access to converted def\n");
-          continue;
-        }
-
-        LiveRegConversion ToLRC(*FromLRC.getConverted(), I.getType(),
-                                UserInst->getParent(),
-                                static_cast<BasicBlock::iterator>(
-                                    UserInst->getParent()->getFirstNonPHIIt()));
-        convertFromOptType(ToLRC);
-        assert(ToLRC.hasConverted());
-        UseConvertTracker[UserInst->getParent()] = {*ToLRC.getConverted(),
-                                                    {UserInst}};
-      }
-    }
-  }
-
-  // Replace uses of with in a separate loop that is not dependent upon the
-  // state of the uses
-  for (auto &Entry : UseConvertTracker) {
-    for (auto &UserInst : Entry.second.Users) {
-      LLVM_DEBUG(dbgs() << *UserInst
-                        << "\n\tNow uses: " << *Entry.second.Converted << "\n");
-      UserInst->replaceUsesOfWith(&I, Entry.second.Converted);
-      MadeChange = true;
-    }
-  }
-  return MadeChange;
-}
-
-bool LiveRegOptimizer::replacePHIs() {
-  bool MadeChange = false;
-  for (auto Ele : PHIUpdater) {
-    auto ThePHINode = dyn_cast<PHINode>(Ele.first);
-    assert(ThePHINode);
-    auto NewPHINodeOps = Ele.second;
-    LLVM_DEBUG(dbgs() << "Attempting to replace: " << *ThePHINode << "\n");
-    // If we have conveted all the required operands, then do the replacement
-    if (ThePHINode->getNumIncomingValues() == NewPHINodeOps.size()) {
-      IRBuilder<> Builder(Ele.first);
-      auto NPHI = Builder.CreatePHI(NewPHINodeOps[0].first->getType(),
-                                    NewPHINodeOps.size());
-      for (auto IncVals : NewPHINodeOps) {
-        NPHI->addIncoming(IncVals.first, IncVals.second);
-        LLVM_DEBUG(dbgs() << "  Using: " << *IncVals.first
-                          << "  For: " << IncVals.second->getName() << "\n");
-      }
-      LLVM_DEBUG(dbgs() << "Sucessfully replaced with " << *NPHI << "\n");
-      LiveRegConversion ToLRC(NPHI, ThePHINode->getType(),
-                              ThePHINode->getParent(),
-                              static_cast<BasicBlock::iterator>(
-                                  ThePHINode->getParent()->getFirstNonPHIIt()));
-      convertFromOptType(ToLRC);
-      assert(ToLRC.hasConverted());
-      Ele.first->replaceAllUsesWith(*ToLRC.getConverted());
-      // The old PHI is no longer used
-      ThePHINode->eraseFromParent();
-      MadeChange = true;
-    }
-  }
-  return MadeChange;
-}
-
-Type *LiveRegOptimizer::getCompatibleType(Instruction *InstToConvert) {
-  auto OriginalType = InstToConvert->getType();
-  assert(OriginalType->getScalarSizeInBits() <=
-         ConvertToScalar->getScalarSizeInBits());
-  auto VTy = dyn_cast<VectorType>(OriginalType);
-  if (!VTy)
-    return ConvertToScalar;
-
-  auto OriginalSize =
-      VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue();
-  auto ConvertScalarSize = ConvertToScalar->getScalarSizeInBits();
-  auto ConvertEltCount =
-      (OriginalSize + ConvertScalarSize - 1) / ConvertScalarSize;
-
-  return VectorType::get(Type::getIntNTy(Mod->getContext(), ConvertScalarSize),
-                         llvm::ElementCount::getFixed(ConvertEltCount));
-}
-
-void LiveRegOptimizer::convertToOptType(LiveRegConversion &LR) {
-  if (LR.hasConverted()) {
-    LLVM_DEBUG(dbgs() << "\tAlready has converted def\n");
-    return;
-  }
-
-  auto VTy = dyn_cast<VectorType>(LR.getOriginalType());
-  assert(VTy);
-  auto NewVTy = dyn_cast<VectorType>(LR.getNewType());
-  assert(NewVTy);
-
-  auto V = static_cast<Value *>(LR.getLiveRegDef());
-  auto OriginalSize =
-      VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue();
-  auto NewSize =
-      NewVTy->getScalarSizeInBits() * NewVTy->getElementCount().getFixedValue();
-
-  auto &Builder = LR.getConverBuilder();
-
-  // If there is a bitsize match, we can fit the old vector into a new vector of
-  // desired type
-  if (OriginalSize == NewSize) {
-    LR.setConverted(dyn_cast<Instruction>(Builder.CreateBitCast(V, NewVTy)));
-    LLVM_DEBUG(dbgs() << "\tConverted def to "
-                      << *(*LR.getConverted())->getType() << "\n");
-    return;
-  }
-
-  // If there is a bitsize mismatch, we must use a wider vector
-  assert(NewSize > OriginalSize);
-  auto ExpandedVecElementCount =
-      llvm::ElementCount::getFixed(NewSize / VTy->getScalarSizeInBits());
-
-  SmallVector<int, 8> ShuffleMask;
-  for (unsigned I = 0; I < VTy->getElementCount().getFixedValue(); I++)
-    ShuffleMask.push_back(I);
-
-  for (uint64_t I = VTy->getElementCount().getFixedValue();
-       I < ExpandedVecElementCount.getFixedValue(); I++)
-    ShuffleMask.push_back(VTy->getElementCount().getFixedValue());
-
-  auto ExpandedVec =
-      dyn_cast<Instruction>(Builder.CreateShuffleVector(V, ShuffleMask));
-  LR.setConverted(
-      dyn_cast<Instruction>(Builder.CreateBitCast(ExpandedVec, NewVTy)));
-  LLVM_DEBUG(dbgs() << "\tConverted def to " << *(*LR.getConverted())->getType()
-                    << "\n");
-  return;
-}
-
-void LiveRegOptimizer::convertFromOptType(LiveRegConversion &LRC) {
-  auto VTy = dyn_cast<VectorType>(LRC.getOriginalType());
-  assert(VTy);
-  auto NewVTy = dyn_cast<VectorType>(LRC.getNewType());
-  assert(NewVTy);
-
-  auto V = static_cast<Value *>(LRC.getLiveRegDef());
-  auto OriginalSize =
-      VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue();
-  auto NewSize =
-      NewVTy->getScalarSizeInBits() * NewVTy->getElementCount().getFixedValue();
-
-  auto &Builder = LRC.getConverBuilder();
-
-  // If there is a bitsize match, we simply convert back to the original type
-  if (OriginalSize == NewSize) {
-    LRC.setConverted(dyn_cast<Instruction>(Builder.CreateBitCast(V, NewVTy)));
-    LLVM_DEBUG(dbgs() << "\tProduced for user: " << **LRC.getConverted()
-                      << "\n");
-    return;
-  }
-
-  // If there is a bitsize mismatch, we have used a wider vector and must strip
-  // the MSBs to convert back to the original type
-  assert(OriginalSize > NewSize);
-  auto ExpandedVecElementCount = llvm::ElementCount::getFixed(
-      OriginalSize / NewVTy->getScalarSizeInBits());
-  auto ExpandedVT = VectorType::get(
-      Type::getIntNTy(Mod->getContext(), NewVTy->getScalarSizeInBits()),
-      ExpandedVecElementCount);
-  auto Converted = dyn_cast<Instruction>(
-      Builder.CreateBitCast(LRC.getLiveRegDef(), ExpandedVT));
-
-  auto NarrowElementCount = NewVTy->getElementCount().getFixedValue();
-  SmallVector<int, 8> ShuffleMask;
-  for (uint64_t I = 0; I < NarrowElementCount; I++)
-    ShuffleMask.push_back(I);
-
-  auto NarrowVec = dyn_cast<Instruction>(
-      Builder.CreateShuffleVector(Converted, ShuffleMask));
-  LRC.setConverted(dyn_cast<Instruction>(NarrowVec));
-  LLVM_DEBUG(dbgs() << "\tProduced for user: " << **LRC.getConverted() << "\n");
-  return;
-}
-
-bool LiveRegOptimizer::shouldReplaceUses(const Instruction &I) {
-  // Vectors of illegal types are copied across blocks in an efficient manner.
-  // They are scalarized and widened to legal scalars. In such cases, we can do
-  // better by using legal vector types
-  auto IType = I.getType();
-  return IType->isVectorTy() && IType->getScalarSizeInBits() < 16 &&
-         !I.getType()->getScalarType()->isPointerTy();
-}
-
 unsigned AMDGPUCodeGenPrepareImpl::getBaseElementBitWidth(const Type *T) const {
   assert(needsPromotionToI32(T) && "T does not need promotion to i32");
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 69fdeaebe0a01..f19f145ae8606 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -81,6 +81,85 @@ class AMDGPULateCodeGenPrepare
   bool visitLoadInst(LoadInst &LI);
 };
 
+class LiveRegConversion {
+private:
+  // The instruction which defined the original virtual register used across
+  // blocks
+  Instruction *LiveRegDef;
+  // The original type
+  Type *OriginalType;
+  // The desired type
+  Type *NewType;
+  // The instruction sequence that converts the virtual register, to be used
+  // instead of the original
+  std::optional<Instruction *> Converted;
+  // The builder used to build the conversion instruction
+  IRBuilder<> ConvertBuilder;
+
+public:
+  // The instruction which defined the original virtual register used across
+  // blocks
+  Instruction *getLiveRegDef() { return LiveRegDef; }
+  // The original type
+  Type *getOriginalType() { return OriginalType; }
+  // The desired type
+  Type *getNewType() { return NewType; }
+  void setNewType(Type *NewType) { this->NewType = NewType; }
+  // The instruction that converts the virtual register, to be used instead of
+  // the original
+  std::optional<Instruction *> &getConverted() { return Converted; }
+  void setConverted(Instruction *Converted) { this->Converted = Converted; }
+  // The builder used to build the conversion instruction
+  IRBuilder<> &getConvertBuilder() { return ConvertBuilder; }
+  // Do we have an instruction sequence which converts the original virtual
+  // register
+  bool hasConverted() { return Converted.has_value(); }
+
+  LiveRegConversion(Instruction *LiveRegDef, BasicBlock *InsertBlock,
+                    BasicBlock::iterator InsertPt)
+      : LiveRegDef(LiveRegDef), OriginalType(LiveRegDef->getType()),
+        ConvertBuilder(InsertBlock, InsertPt) {}
+  LiveRegConversion(Instruction *LiveRegDef, Type *NewType,
+                    BasicBlock *InsertBlock, BasicBlock::iterator InsertPt)
+      : LiveRegDef(LiveRegDef), OriginalType(LiveRegDef->getType()),
+        NewType(NewType), ConvertBuilder(InsertBlock, InsertPt) {}
+};
+
+class LiveRegOptimizer {
+private:
+  Module *Mod = nullptr;
+  // The scalar type to convert to
+  Type *ConvertToScalar;
+  // Holds the collection of PHIs with their pending new operands
+  SmallVector<std::pair<Instruction *,
+                        SmallVector<std::pair<Instruction *, BasicBlock *>, 4>>,
+              4>
+      PHIUpdater;
+
+public:
+  // Should the def of the instruction be converted if it is live across blocks
+  bool shouldReplaceUses(const Instruction &I);
+  // Convert the virtual register to the compatible vector of legal type
+  void convertToOptType(LiveRegConversion &LR);
+  // Convert the virtual register back to the original type, stripping away
+  // the MSBs in cases where there was an imperfect fit (e.g. v2i32 -> v7i8)
+  void convertFromOptType(LiveRegConversion &LR);
+  // Get a vector of desired scalar type that is compatible with the original
+  // vector. In cases where there is no bitsize equivalent using a legal vector
+  // type, we pad the MSBs (e.g. v7i8 -> v2i32)
+  Type *getCompatibleType(Instruction *InstToConvert);
+  // Find and replace uses of the virtual register in different block with a
+  // newly produced virtual register of legal type
+  bool replaceUses(Instruction &I);
+  // Replace the collected PHIs with newly produced incoming values. Replacement
+  // is only done if we have a replacement for each original incoming value.
+  bool replacePHIs();
+
+  LiveRegOptimizer(Module *Mod) : Mod(Mod) {
+    ConvertToScalar = Type::getInt32Ty(Mod->getContext());
+  }
+};
+
 } // end anonymous namespace
 
 bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
@@ -102,14 +181,287 @@ bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
   AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
   UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
 
+  // "Optimize" the virtual regs that cross basic block boundaries. In such
+  // cases, vectors of illegal types will be scalarized and widened, with each
+  // scalar living in its own physical register. The optimization converts the
+  // vectors to equivalent vectors of legal type (which are convereted back
+  // before uses in subsequenmt blocks), to pack the bits into fewer physical
+  // registers (used in CopyToReg/CopyFromReg pairs).
+  LiveRegOptimizer LRO(Mod);
+
   bool Changed = false;
   for (auto &BB : F)
-    for (Instruction &I : llvm::make_early_inc_range(BB))
+    for (Instruction &I : llvm::make_early_inc_range(BB)) {
       Changed |= visit(I);
-
+      // GlobalISel should directly use the values, and do not need to emit
+      // CopyTo/CopyFrom Regs across blocks
+      if (TM.Options.EnableGlobalISel)
+        continue;
+      if (!LRO.shouldReplaceUses(I))
+        continue;
+      Changed |= LRO.replaceUses(I);
+    }
+
+  Changed |= LRO.replacePHIs();
   return Changed;
 }
 
+bool LiveRegOptimizer::replaceUses(Instruction &I) {
+  bool MadeChange = false;
+
+  struct ConvertUseInfo {
+    Instruction *Converted;
+    SmallVector<Instruction *, 4> Users;
+  };
+  DenseMap<BasicBlock *, ConvertUseInfo> UseConvertTracker;
+
+  LiveRegConversion FromLRC(
+      &I, I.getParent(),
+      static_cast<BasicBlock::iterator>(std::next(I.getIterator())));
+  FromLRC.setNewType(getCompatibleType(FromLRC.getLiveRegDef()));
+  for (auto IUser = I.user_begin(); IUser != I.user_end(); IUser++) {
+
+    if (auto UserInst = dyn_cast<Instruction>(*IUser)) {
+      if (UserInst->getParent() != I.getParent() || isa<PHINode>(UserInst)) {
+        LLVM_DEBUG(dbgs() << *UserInst << "\n\tUses "
+                          << *FromLRC.getOriginalType()
+                          << " from previous block. Needs conversion\n");
+        convertToOptType(FromLRC);
+        if (!FromLRC.hasConverted())
+          continue;
+        // If it is a PHI node, just create and collect the new operand. We can
+        // only replace the PHI node once we have converted all the operands
+        if (auto PhiInst = dyn_cast<PHINode>(UserInst)) {
+          for (unsigned Idx = 0; Idx < PhiInst->getNumIncomingValues(); Idx++) {
+            auto IncVal = PhiInst->getIncomingValue(Idx);
+            if (&I == dyn_cast<Instruction>(IncVal)) {
+              auto IncBlock = PhiInst->getIncomingBlock(Idx);
+              auto PHIOps = find_if(
+                  PHIUpdater,
+                  [&UserInst](
+                      std::pair<Instruction *,
+                                SmallVector<
+                                    std::pair<Instruction *, BasicBlock *>, 4>>
+                          &Entry) { return Entry.first == UserInst; });
+
+              if (PHIOps == PHIUpdater.end())
+                PHIUpdater.push_back(
+                    {UserInst, {{*FromLRC.getConverted(), IncBlock}}});
+              else
+                PHIOps->second.push_back({*FromLRC.getConverted(), IncBlock});
+
+              break;
+            }
+          }
+          continue;
+        }
+
+        // Do not create multiple conversion sequences if there are multiple
+        // uses in the same block
+        if (UseConvertTracker.contains(UserInst->getParent())) {
+          UseConvertTracker[UserInst->getParent()].Users.push_back(UserInst);
+          LLVM_DEBUG(dbgs() << "\tUser already has access to converted def\n");
+          continue;
+        }
+
+        LiveRegConversion ToLRC(*FromLRC.getConverted(), I.getType(),
+                                UserInst->getParent(),
+                                static_cast<BasicBlock::iterator>(
+                                    UserInst->getParent()->getFirstNonPHIIt()));
+        convertFromOptType(ToLRC);
+        assert(ToLRC.hasConverted());
+        UseConvertTracker[UserInst->getParent()] = {*ToLRC.getConverted(),
+                                                    {UserInst}};
+      }
+    }
+  }
+
+  // Replace uses of with in a separate loop that is not dependent upon the
+  // state of the uses
+  for (auto &Entry : UseConvertTracker) {
+    for (auto &UserInst : Entry.second.Users) {
+      LLVM_DEBUG(dbgs() << *UserInst
+                        << "\n\tNow uses: " << *Entry.second.Converted << "\n");
+      UserInst->replaceUsesOfWith(&I, Entry.second.Converted);
+      MadeChange = true;
+    }
+  }
+  return MadeChange;
+}
+
+bool LiveRegOptimizer::replacePHIs() {
+  bool MadeChange = false;
+  for (auto Ele : PHIUpdater) {
+    auto ThePHINode = cast<PHINode>(Ele.first);
+    auto NewPHINodeOps = Ele.second;
+    LLVM_DEBUG(dbgs() << "Attempting to replace: " << *ThePHINode << "\n");
+    // If we have converted all the required operands, then do the replacement
+    if (ThePHINode->getNumIncomingValues() == NewPHINodeOps.size()) {
+      IRBuilder<> Builder(Ele.first);
+      auto NPHI = Builder.CreatePHI(NewPHINodeOps[0].first->getType(),
+                                    NewPHINodeOps.size());
+      for (auto IncVals : NewPHINodeOps) {
+        NPHI->addIncoming(IncVals.first, IncVals.second);
+        LLVM_DEBUG(dbgs() << "  Using: " << *IncVals.first
+                          << "  For: " << IncVals.second->getName() << "\n");
+      }
+      LLVM_DEBUG(dbgs() << "Sucessfully replaced with " << *NPHI << "\n");
+      LiveRegConversion ToLRC(NPHI, ThePHINode->getType(),
+                              ThePHINode->getParent(),
+                              static_cast<BasicBlock::iterator>(
+                                  ThePHINode->getParent()->getFirstNonPHIIt()));
+      convertFromOptType(ToLRC);
+      assert(ToLRC.hasConverted());
+      Ele.first->replaceAllUsesWith(*ToLRC.getConverted());
+      // The old PHI is no longer used
+      ThePHINode->eraseFromParent();
+      MadeChange = true;
+    }
+  }
+  return MadeChange;
+}
+
+Type *LiveRegOptimizer::getCompatibleType(Instruction *InstToConvert) {
+  auto OriginalType = InstToConvert->getType();
+  assert(OriginalType->getScalarSizeInBits() <=
+         ConvertToScalar->getScalarSizeInBits());
+  auto VTy = dyn_cast<VectorType>(OriginalType);
+  if (!VTy)
+    return ConvertToScalar;
+
+  auto OriginalSize =
+      VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue();
+  auto ConvertScalarSize = ConvertToScalar->getScalarSizeInBits();
+  auto ConvertEltCount =
+      (OriginalSize + ConvertScalarSize - 1) / ConvertScalarSize;
+
+  if (OriginalSize <= ConvertScalarSize)
+    return IntegerType::get(Mod->getContext(), ConvertScalarSize);
+
+  return VectorType::get(Type::getIntNTy(Mod->getContext(), ConvertScalarSize),
+                         llvm::ElementCount::getFixed(ConvertEltCount));
+}
+
+void LiveRegOptimizer::convertToOptType(LiveRegConversion &LR) {
+  if (LR.hasConverted()) {
+    LLVM_DEBUG(dbgs() << "\tAlready has converted def\n");
+    return;
+  }
+
+  auto VTy = cast<VectorType>(LR.getOriginalType());
+
+  auto NewTy = LR.getNewType();
+  assert(NewTy);
+  auto NewVTy = NewTy->isVectorTy() ? cast<VectorType>(NewTy) : nullptr;
+
+  auto V = static_cast<Value *>(LR.getLiveRegDef());
+  auto OriginalSize =
+      VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue();
+  auto NewSize = NewTy->isVectorTy()
+                     ? NewVTy->getScalarSizeInBits() *
+                           NewVTy->getElementCount().getFixedValue()
+                     : NewTy->getScalarSizeInBits();
+
+  auto &Builder = LR.getConvertBuilder();
+
+  // If there is a bitsize match, we can fit the old vector into a new vector of
+  // desired type
+  if (OriginalSize == NewSize) {
+    LR.setConverted(dyn_cast<Instruction>(Builder.CreateBitCast(V, NewTy)));
+    LLVM_DEBUG(dbgs() << "\tConverted def to "
+                      << *(*LR.getConverted())->getType() << "\n");
+    return;
+  }
+
+  // If there is a bitsize mismatch, we must use a wider vector
+  assert(NewSize > OriginalSize);
+  auto ExpandedVecElementCount =
+      llvm::ElementCount::getFixed(NewSize / VTy->getScalarSizeInBits());
+
+  SmallVector<int, 8> ShuffleMask;
+  for (unsigned I = 0; I < VTy->getElementCount().getFixedValue(); I++)
+    ShuffleMask.push_back(I);
+
+  for (uint64_t I = VTy->getElementCount().getFixedValue();
+       I < ExpandedVecElementCount.getFixedValue(); I++)
+    ShuffleMask.push_back(VTy->getElementCount().getFixedValue());
+
+  auto ExpandedVec =
+      dyn_cast<Instruction>(Builder.CreateShuffleVector(V, ShuffleMask));
+  LR.setConverted(
+      dyn_cast<Instruction>(Builder.CreateBitCast(ExpandedVec, NewTy)));
+  LLVM_DEBUG(dbgs() << "\tConverted def to " << *(*LR.getConverted())->getType()
+                    << "\n");
+  return;
+}
+
+void LiveRegOptimizer::convertFromOptType(LiveRegConversion &LRC) {
+  auto OTy = LRC.getOriginalType();
+  auto VTy =
+      OTy->isVectorTy() ? dyn_cast<VectorType>(LRC.getOriginalType()) : nullptr;
+
+  auto NewVTy = cast<VectorType>(LRC.getNewType());
+
+  auto V = static_cast<Value *>(LRC.getLiveRegDef());
+  auto OriginalSize =
+      OTy->isVectorTy()
+          ? VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue()
+          : OTy->getScalarSizeInBits();
+  auto NewSize =
+      NewVTy->getScalarSizeInBits() * NewVTy->getElementCount().getFixedValue();
+
+  auto &Builder = LRC.getConvertBuilder();
+
+  // If there is a bitsize match, we simply convert back to the original type
+  if (OriginalSize == NewSize) {
+    LRC.setConverted(dyn_cast<Instruction>(Builder.CreateBitCast(V, NewVTy)));
+    LLVM_DEBUG(dbgs() << "\tProduced for user: " << **LRC.getConverted()
+                      << "\n");
+    return;
+  }
+
+  if (!OTy->isVectorTy()) {
+    auto Trunc = dyn_cast<Instruction>(Builder.CreateTrunc(
+        LRC.getLiveRegDef(), IntegerType::get(Mod->getContext(), NewSize)));
+    auto Original = dyn_cast<Instruction>(Builder.CreateBitCast(Trunc, NewVTy));
+    LRC.setConverted(dyn_cast<Instruction>(Original));
+    LLVM_DEBUG(dbgs() << "\tProduced for user: " << **LRC.getConverted()
+                      << "\n");
+    return;
+  }
+
+  // If there is a bitsize mismatch, we have used a wider vector and must strip
+  // the MSBs to convert back to the original type
+  assert(OriginalSize > NewSize);
+  auto ExpandedVecElementCount = llvm::ElementCount::getFixed(
+      OriginalSize / NewVTy->getScalarSizeInBits());
+  auto ExpandedVT = VectorType::get(
+      Type::getIntNTy(Mod->getContext(), NewVTy->getScalarSizeInBits()),
+      ExpandedVecElementCount);
+  auto Converted = dyn_cast<Instruction>(
+      Builder.CreateBitCast(LRC.getLiveRegDef(), ExpandedVT));
+
+  auto NarrowElementCount = NewVTy->getElementCount().getFixedValue();
+  SmallVector<int, 8> ShuffleMask;
+  for (uint64_t I = 0; I < NarrowElementCount; I++)
+    ShuffleMask.push_back(I);
+
+  auto NarrowVec = dyn_cast<Instruction>(
+      Builder.CreateShuffleVector(Converted, ShuffleMask));
+  LRC.setConverted(dyn_cast<Instruction>(NarrowVec));
+  LLVM_DEBUG(dbgs() << "\tProduced for user: " << **LRC.getConverted() << "\n");
+  return;
+}
+
+bool LiveRegOptimizer::shouldReplaceUses(const Instruction &I) {
+  // Vectors of illegal types are copied across blocks in an efficient manner.
+  // They are scalarized and widened to legal scalars. In such cases, we can do
+  // better by using legal vector types
+  auto IType = I.getType();
+  return IType->isVectorTy() && IType->getScalarSizeInBits() < 16 &&
+         !I.getType()->getScalarType()->isPointerTy();
+}
+
 bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
   unsigned AS = LI.getPointerAddressSpace();
   // Skip non-constant address space.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 305a6c8c3b926..c15481336075e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1150,10 +1150,10 @@ bool GCNPassConfig::addPreISel() {
   AMDGPUPassConfig::addPreISel();
 
   if (TM->getOptLevel() > CodeGenOptLevel::None)
-    addPass(createAMDGPULateCodeGenPreparePass());
+    addPass(createSinkingPass());
 
   if (TM->getOptLevel() > CodeGenOptLevel::None)
-    addPass(createSinkingPass());
+    addPass(createAMDGPULateCodeGenPreparePass());
 
   // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
   // regions formed by them.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll
index f4871fa131442..11772d252a16f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll
@@ -495,15 +495,10 @@ define amdgpu_kernel void @phi_v23i8(<23 x i8> %in, ptr %out, i1 %cond) {
 ; OPT:       then:
 ; OPT-NEXT:    [[X:%.*]] = insertelement <23 x i8> [[IN:%.*]], i8 42, i32 3
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE0:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; OPT-NEXT:    [[TMP0:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE0]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE2:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; OPT-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE2]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE4:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; OPT-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE4]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE6:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; OPT-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE6]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE8:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; OPT-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE8]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE10:%.*]] = extractelement <23 x i8> [[X]], i64 20
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE12:%.*]] = extractelement <23 x i8> [[X]], i64 21
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE14:%.*]] = extractelement <23 x i8> [[X]], i64 22
@@ -511,41 +506,31 @@ define amdgpu_kernel void @phi_v23i8(<23 x i8> %in, ptr %out, i1 %cond) {
 ; OPT:       else:
 ; OPT-NEXT:    [[Y:%.*]] = insertelement <23 x i8> [[IN]], i8 64, i32 6
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE1:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; OPT-NEXT:    [[TMP5:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE1]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE3:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; OPT-NEXT:    [[TMP6:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE3]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE5:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; OPT-NEXT:    [[TMP7:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE5]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE7:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; OPT-NEXT:    [[TMP8:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE7]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE9:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; OPT-NEXT:    [[TMP9:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE9]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE11:%.*]] = extractelement <23 x i8> [[Y]], i64 20
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE13:%.*]] = extractelement <23 x i8> [[Y]], i64 21
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE15:%.*]] = extractelement <23 x i8> [[Y]], i64 22
 ; OPT-NEXT:    br label [[FINALLY]]
 ; OPT:       finally:
-; OPT-NEXT:    [[TMP10:%.*]] = phi <1 x i32> [ [[TMP0]], [[THEN]] ], [ [[TMP5]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP11:%.*]] = phi <1 x i32> [ [[TMP1]], [[THEN]] ], [ [[TMP6]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP12:%.*]] = phi <1 x i32> [ [[TMP2]], [[THEN]] ], [ [[TMP7]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP13:%.*]] = phi <1 x i32> [ [[TMP3]], [[THEN]] ], [ [[TMP8]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP14:%.*]] = phi <1 x i32> [ [[TMP4]], [[THEN]] ], [ [[TMP9]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP15:%.*]] = phi i8 [ [[LARGEPHI_EXTRACTSLICE10]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP16:%.*]] = phi i8 [ [[LARGEPHI_EXTRACTSLICE12]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE13]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP17:%.*]] = phi i8 [ [[LARGEPHI_EXTRACTSLICE14]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE15]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP18:%.*]] = bitcast <1 x i32> [[TMP14]] to <4 x i8>
-; OPT-NEXT:    [[TMP19:%.*]] = bitcast <1 x i32> [[TMP13]] to <4 x i8>
-; OPT-NEXT:    [[TMP20:%.*]] = bitcast <1 x i32> [[TMP12]] to <4 x i8>
-; OPT-NEXT:    [[TMP21:%.*]] = bitcast <1 x i32> [[TMP11]] to <4 x i8>
-; OPT-NEXT:    [[TMP22:%.*]] = bitcast <1 x i32> [[TMP10]] to <4 x i8>
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE0:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> poison, <4 x i8> [[TMP22]], i64 0)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE1:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE0]], <4 x i8> [[TMP21]], i64 4)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE2:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE1]], <4 x i8> [[TMP20]], i64 8)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE3:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE2]], <4 x i8> [[TMP19]], i64 12)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE4:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE3]], <4 x i8> [[TMP18]], i64 16)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE5:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE4]], i8 [[TMP15]], i64 20
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE6:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE5]], i8 [[TMP16]], i64 21
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE7:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE6]], i8 [[TMP17]], i64 22
+; OPT-NEXT:    [[TMP0:%.*]] = phi <4 x i8> [ [[LARGEPHI_EXTRACTSLICE0]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP1:%.*]] = phi <4 x i8> [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP2:%.*]] = phi <4 x i8> [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP3:%.*]] = phi <4 x i8> [ [[LARGEPHI_EXTRACTSLICE6]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP4:%.*]] = phi <4 x i8> [ [[LARGEPHI_EXTRACTSLICE8]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP5:%.*]] = phi i8 [ [[LARGEPHI_EXTRACTSLICE10]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP6:%.*]] = phi i8 [ [[LARGEPHI_EXTRACTSLICE12]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE13]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP7:%.*]] = phi i8 [ [[LARGEPHI_EXTRACTSLICE14]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE15]], [[ELSE]] ]
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE0:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> poison, <4 x i8> [[TMP0]], i64 0)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE1:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE0]], <4 x i8> [[TMP1]], i64 4)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE2:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE1]], <4 x i8> [[TMP2]], i64 8)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE3:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE2]], <4 x i8> [[TMP3]], i64 12)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE4:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE3]], <4 x i8> [[TMP4]], i64 16)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE5:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE4]], i8 [[TMP5]], i64 20
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE6:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE5]], i8 [[TMP6]], i64 21
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE7:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE6]], i8 [[TMP7]], i64 22
 ; OPT-NEXT:    store <23 x i8> [[LARGEPHI_INSERTSLICE7]], ptr [[OUT:%.*]], align 1
 ; OPT-NEXT:    ret void
 ;
@@ -554,19 +539,13 @@ define amdgpu_kernel void @phi_v23i8(<23 x i8> %in, ptr %out, i1 %cond) {
 ; NOOPT-NEXT:    br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]]
 ; NOOPT:       then:
 ; NOOPT-NEXT:    [[X:%.*]] = insertelement <23 x i8> [[IN:%.*]], i8 42, i32 3
-; NOOPT-NEXT:    [[TMP0:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; NOOPT-NEXT:    [[TMP1:%.*]] = bitcast <24 x i8> [[TMP0]] to <6 x i32>
 ; NOOPT-NEXT:    br label [[FINALLY:%.*]]
 ; NOOPT:       else:
 ; NOOPT-NEXT:    [[Y:%.*]] = insertelement <23 x i8> [[IN]], i8 64, i32 6
-; NOOPT-NEXT:    [[TMP2:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; NOOPT-NEXT:    [[TMP3:%.*]] = bitcast <24 x i8> [[TMP2]] to <6 x i32>
 ; NOOPT-NEXT:    br label [[FINALLY]]
 ; NOOPT:       finally:
-; NOOPT-NEXT:    [[TMP4:%.*]] = phi <6 x i32> [ [[TMP1]], [[THEN]] ], [ [[TMP3]], [[ELSE]] ]
-; NOOPT-NEXT:    [[TMP5:%.*]] = bitcast <6 x i32> [[TMP4]] to <24 x i8>
-; NOOPT-NEXT:    [[TMP6:%.*]] = shufflevector <24 x i8> [[TMP5]], <24 x i8> poison, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22>
-; NOOPT-NEXT:    store <23 x i8> [[TMP6]], ptr [[OUT:%.*]], align 1
+; NOOPT-NEXT:    [[VAL:%.*]] = phi <23 x i8> [ [[X]], [[THEN]] ], [ [[Y]], [[ELSE]] ]
+; NOOPT-NEXT:    store <23 x i8> [[VAL]], ptr [[OUT:%.*]], align 1
 ; NOOPT-NEXT:    ret void
 ;
 entry:
@@ -593,36 +572,31 @@ define amdgpu_kernel void @phi_v23i8_zeroinit(<23 x i8> %in, ptr %out, i1 %cond)
 ; OPT:       else:
 ; OPT-NEXT:    [[Y:%.*]] = insertelement <23 x i8> [[IN:%.*]], i8 64, i32 6
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE1:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; OPT-NEXT:    [[TMP0:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE1]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE3:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; OPT-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE3]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE5:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; OPT-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE5]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE7:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; OPT-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE7]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE9:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; OPT-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE9]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE11:%.*]] = extractelement <23 x i8> [[Y]], i64 20
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE13:%.*]] = extractelement <23 x i8> [[Y]], i64 21
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE15:%.*]] = extractelement <23 x i8> [[Y]], i64 22
 ; OPT-NEXT:    br label [[FINALLY]]
 ; OPT:       finally:
-; OPT-NEXT:    [[TMP5:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP6:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP7:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP8:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP9:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP10:%.*]] = phi i8 [ 0, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP11:%.*]] = phi i8 [ 0, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE13]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP12:%.*]] = phi i8 [ 0, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE15]], [[ELSE]] ]
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE0:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> poison, <4 x i8> [[TMP5]], i64 0)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE1:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE0]], <4 x i8> [[TMP6]], i64 4)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE2:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE1]], <4 x i8> [[TMP7]], i64 8)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE3:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE2]], <4 x i8> [[TMP8]], i64 12)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE4:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE3]], <4 x i8> [[TMP9]], i64 16)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE5:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE4]], i8 [[TMP10]], i64 20
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE6:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE5]], i8 [[TMP11]], i64 21
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE7:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE6]], i8 [[TMP12]], i64 22
+; OPT-NEXT:    [[TMP0:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP1:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP2:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP3:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP4:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP5:%.*]] = phi i8 [ 0, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP6:%.*]] = phi i8 [ 0, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE13]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP7:%.*]] = phi i8 [ 0, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE15]], [[ELSE]] ]
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE0:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> poison, <4 x i8> [[TMP0]], i64 0)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE1:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE0]], <4 x i8> [[TMP1]], i64 4)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE2:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE1]], <4 x i8> [[TMP2]], i64 8)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE3:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE2]], <4 x i8> [[TMP3]], i64 12)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE4:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE3]], <4 x i8> [[TMP4]], i64 16)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE5:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE4]], i8 [[TMP5]], i64 20
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE6:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE5]], i8 [[TMP6]], i64 21
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE7:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE6]], i8 [[TMP7]], i64 22
 ; OPT-NEXT:    store <23 x i8> [[LARGEPHI_INSERTSLICE7]], ptr [[OUT:%.*]], align 1
 ; OPT-NEXT:    ret void
 ;
@@ -633,8 +607,6 @@ define amdgpu_kernel void @phi_v23i8_zeroinit(<23 x i8> %in, ptr %out, i1 %cond)
 ; NOOPT-NEXT:    br label [[FINALLY:%.*]]
 ; NOOPT:       else:
 ; NOOPT-NEXT:    [[Y:%.*]] = insertelement <23 x i8> [[IN:%.*]], i8 64, i32 6
-; NOOPT-NEXT:    [[TMP0:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; NOOPT-NEXT:    [[TMP1:%.*]] = bitcast <24 x i8> [[TMP0]] to <6 x i32>
 ; NOOPT-NEXT:    br label [[FINALLY]]
 ; NOOPT:       finally:
 ; NOOPT-NEXT:    [[VAL:%.*]] = phi <23 x i8> [ zeroinitializer, [[THEN]] ], [ [[Y]], [[ELSE]] ]
@@ -663,28 +635,25 @@ define amdgpu_kernel void @phi_v15i8_random_constant_init(<15 x i8> %in, ptr %ou
 ; OPT:       else:
 ; OPT-NEXT:    [[Y:%.*]] = insertelement <15 x i8> [[IN:%.*]], i8 64, i32 6
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE1:%.*]] = shufflevector <15 x i8> [[Y]], <15 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; OPT-NEXT:    [[TMP0:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE1]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE3:%.*]] = shufflevector <15 x i8> [[Y]], <15 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; OPT-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE3]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE5:%.*]] = shufflevector <15 x i8> [[Y]], <15 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; OPT-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[LARGEPHI_EXTRACTSLICE5]] to <1 x i32>
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE7:%.*]] = extractelement <15 x i8> [[Y]], i64 12
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE9:%.*]] = extractelement <15 x i8> [[Y]], i64 13
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE11:%.*]] = extractelement <15 x i8> [[Y]], i64 14
 ; OPT-NEXT:    br label [[FINALLY]]
 ; OPT:       finally:
-; OPT-NEXT:    [[TMP3:%.*]] = phi <4 x i8> [ <i8 poison, i8 1, i8 2, i8 3>, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP4:%.*]] = phi <4 x i8> [ <i8 4, i8 undef, i8 6, i8 7>, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP5:%.*]] = phi <4 x i8> [ <i8 9, i8 10, i8 11, i8 12>, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP6:%.*]] = phi i8 [ 13, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP7:%.*]] = phi i8 [ 14, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ]
-; OPT-NEXT:    [[TMP8:%.*]] = phi i8 [ undef, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ]
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE0:%.*]] = call <15 x i8> @llvm.vector.insert.v15i8.v4i8(<15 x i8> poison, <4 x i8> [[TMP3]], i64 0)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE1:%.*]] = call <15 x i8> @llvm.vector.insert.v15i8.v4i8(<15 x i8> [[LARGEPHI_INSERTSLICE0]], <4 x i8> [[TMP4]], i64 4)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE2:%.*]] = call <15 x i8> @llvm.vector.insert.v15i8.v4i8(<15 x i8> [[LARGEPHI_INSERTSLICE1]], <4 x i8> [[TMP5]], i64 8)
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE3:%.*]] = insertelement <15 x i8> [[LARGEPHI_INSERTSLICE2]], i8 [[TMP6]], i64 12
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE4:%.*]] = insertelement <15 x i8> [[LARGEPHI_INSERTSLICE3]], i8 [[TMP7]], i64 13
-; OPT-NEXT:    [[LARGEPHI_INSERTSLICE5:%.*]] = insertelement <15 x i8> [[LARGEPHI_INSERTSLICE4]], i8 [[TMP8]], i64 14
+; OPT-NEXT:    [[TMP0:%.*]] = phi <4 x i8> [ <i8 poison, i8 1, i8 2, i8 3>, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP1:%.*]] = phi <4 x i8> [ <i8 4, i8 undef, i8 6, i8 7>, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP2:%.*]] = phi <4 x i8> [ <i8 9, i8 10, i8 11, i8 12>, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP3:%.*]] = phi i8 [ 13, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP4:%.*]] = phi i8 [ 14, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ]
+; OPT-NEXT:    [[TMP5:%.*]] = phi i8 [ undef, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ]
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE0:%.*]] = call <15 x i8> @llvm.vector.insert.v15i8.v4i8(<15 x i8> poison, <4 x i8> [[TMP0]], i64 0)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE1:%.*]] = call <15 x i8> @llvm.vector.insert.v15i8.v4i8(<15 x i8> [[LARGEPHI_INSERTSLICE0]], <4 x i8> [[TMP1]], i64 4)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE2:%.*]] = call <15 x i8> @llvm.vector.insert.v15i8.v4i8(<15 x i8> [[LARGEPHI_INSERTSLICE1]], <4 x i8> [[TMP2]], i64 8)
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE3:%.*]] = insertelement <15 x i8> [[LARGEPHI_INSERTSLICE2]], i8 [[TMP3]], i64 12
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE4:%.*]] = insertelement <15 x i8> [[LARGEPHI_INSERTSLICE3]], i8 [[TMP4]], i64 13
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE5:%.*]] = insertelement <15 x i8> [[LARGEPHI_INSERTSLICE4]], i8 [[TMP5]], i64 14
 ; OPT-NEXT:    store <15 x i8> [[LARGEPHI_INSERTSLICE5]], ptr [[OUT:%.*]], align 1
 ; OPT-NEXT:    ret void
 ;
@@ -695,8 +664,6 @@ define amdgpu_kernel void @phi_v15i8_random_constant_init(<15 x i8> %in, ptr %ou
 ; NOOPT-NEXT:    br label [[FINALLY:%.*]]
 ; NOOPT:       else:
 ; NOOPT-NEXT:    [[Y:%.*]] = insertelement <15 x i8> [[IN:%.*]], i8 64, i32 6
-; NOOPT-NEXT:    [[TMP0:%.*]] = shufflevector <15 x i8> [[Y]], <15 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; NOOPT-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
 ; NOOPT-NEXT:    br label [[FINALLY]]
 ; NOOPT:       finally:
 ; NOOPT-NEXT:    [[VAL:%.*]] = phi <15 x i8> [ <i8 poison, i8 1, i8 2, i8 3, i8 4, i8 undef, i8 6, i8 7, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 undef>, [[THEN]] ], [ [[Y]], [[ELSE]] ]
@@ -1020,8 +987,8 @@ define amdgpu_kernel void @phi_v7i16_switch(<7 x i16> %in, ptr %out, i8 %cond) {
 ; OPT-NEXT:  entry:
 ; OPT-NEXT:    [[X:%.*]] = insertelement <7 x i16> [[IN:%.*]], i16 3, i32 3
 ; OPT-NEXT:    switch i8 [[COND:%.*]], label [[ELSE:%.*]] [
-; OPT-NEXT:    i8 0, label [[THEN_1:%.*]]
-; OPT-NEXT:    i8 3, label [[THEN_2:%.*]]
+; OPT-NEXT:      i8 0, label [[THEN_1:%.*]]
+; OPT-NEXT:      i8 3, label [[THEN_2:%.*]]
 ; OPT-NEXT:    ]
 ; OPT:       then.1:
 ; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE0:%.*]] = shufflevector <7 x i16> [[X]], <7 x i16> poison, <2 x i32> <i32 0, i32 1>
@@ -1058,8 +1025,8 @@ define amdgpu_kernel void @phi_v7i16_switch(<7 x i16> %in, ptr %out, i8 %cond) {
 ; NOOPT-NEXT:  entry:
 ; NOOPT-NEXT:    [[X:%.*]] = insertelement <7 x i16> [[IN:%.*]], i16 3, i32 3
 ; NOOPT-NEXT:    switch i8 [[COND:%.*]], label [[ELSE:%.*]] [
-; NOOPT-NEXT:    i8 0, label [[THEN_1:%.*]]
-; NOOPT-NEXT:    i8 3, label [[THEN_2:%.*]]
+; NOOPT-NEXT:      i8 0, label [[THEN_1:%.*]]
+; NOOPT-NEXT:      i8 3, label [[THEN_2:%.*]]
 ; NOOPT-NEXT:    ]
 ; NOOPT:       then.1:
 ; NOOPT-NEXT:    br label [[FINALLY:%.*]]
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 0ff5dd3680dfa..29f9e3bf94d05 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -251,13 +251,13 @@
 ; GCN-O1-NEXT:        Function Alias Analysis Results
 ; GCN-O1-NEXT:        Flatten the CFG
 ; GCN-O1-NEXT:        Dominator Tree Construction
-; GCN-O1-NEXT:        Cycle Info Analysis
-; GCN-O1-NEXT:        Uniformity Analysis
-; GCN-O1-NEXT:        AMDGPU IR late optimizations
 ; GCN-O1-NEXT:        Basic Alias Analysis (stateless AA impl)
 ; GCN-O1-NEXT:        Function Alias Analysis Results
 ; GCN-O1-NEXT:        Natural Loop Information
 ; GCN-O1-NEXT:        Code sinking
+; GCN-O1-NEXT:        Cycle Info Analysis
+; GCN-O1-NEXT:        Uniformity Analysis
+; GCN-O1-NEXT:        AMDGPU IR late optimizations
 ; GCN-O1-NEXT:        Post-Dominator Tree Construction
 ; GCN-O1-NEXT:        Unify divergent function exit nodes
 ; GCN-O1-NEXT:        Dominator Tree Construction
@@ -546,13 +546,13 @@
 ; GCN-O1-OPTS-NEXT:        Function Alias Analysis Results
 ; GCN-O1-OPTS-NEXT:        Flatten the CFG
 ; GCN-O1-OPTS-NEXT:        Dominator Tree Construction
-; GCN-O1-OPTS-NEXT:        Cycle Info Analysis
-; GCN-O1-OPTS-NEXT:        Uniformity Analysis
-; GCN-O1-OPTS-NEXT:        AMDGPU IR late optimizations
 ; GCN-O1-OPTS-NEXT:        Basic Alias Analysis (stateless AA impl)
 ; GCN-O1-OPTS-NEXT:        Function Alias Analysis Results
 ; GCN-O1-OPTS-NEXT:        Natural Loop Information
 ; GCN-O1-OPTS-NEXT:        Code sinking
+; GCN-O1-OPTS-NEXT:        Cycle Info Analysis
+; GCN-O1-OPTS-NEXT:        Uniformity Analysis
+; GCN-O1-OPTS-NEXT:        AMDGPU IR late optimizations
 ; GCN-O1-OPTS-NEXT:        Post-Dominator Tree Construction
 ; GCN-O1-OPTS-NEXT:        Unify divergent function exit nodes
 ; GCN-O1-OPTS-NEXT:        Dominator Tree Construction
@@ -853,13 +853,13 @@
 ; GCN-O2-NEXT:        Function Alias Analysis Results
 ; GCN-O2-NEXT:        Flatten the CFG
 ; GCN-O2-NEXT:        Dominator Tree Construction
-; GCN-O2-NEXT:        Cycle Info Analysis
-; GCN-O2-NEXT:        Uniformity Analysis
-; GCN-O2-NEXT:        AMDGPU IR late optimizations
 ; GCN-O2-NEXT:        Basic Alias Analysis (stateless AA impl)
 ; GCN-O2-NEXT:        Function Alias Analysis Results
 ; GCN-O2-NEXT:        Natural Loop Information
 ; GCN-O2-NEXT:        Code sinking
+; GCN-O2-NEXT:        Cycle Info Analysis
+; GCN-O2-NEXT:        Uniformity Analysis
+; GCN-O2-NEXT:        AMDGPU IR late optimizations
 ; GCN-O2-NEXT:        Post-Dominator Tree Construction
 ; GCN-O2-NEXT:        Unify divergent function exit nodes
 ; GCN-O2-NEXT:        Dominator Tree Construction
@@ -1174,13 +1174,13 @@
 ; GCN-O3-NEXT:        Function Alias Analysis Results
 ; GCN-O3-NEXT:        Flatten the CFG
 ; GCN-O3-NEXT:        Dominator Tree Construction
-; GCN-O3-NEXT:        Cycle Info Analysis
-; GCN-O3-NEXT:        Uniformity Analysis
-; GCN-O3-NEXT:        AMDGPU IR late optimizations
 ; GCN-O3-NEXT:        Basic Alias Analysis (stateless AA impl)
 ; GCN-O3-NEXT:        Function Alias Analysis Results
 ; GCN-O3-NEXT:        Natural Loop Information
 ; GCN-O3-NEXT:        Code sinking
+; GCN-O3-NEXT:        Cycle Info Analysis
+; GCN-O3-NEXT:        Uniformity Analysis
+; GCN-O3-NEXT:        AMDGPU IR late optimizations
 ; GCN-O3-NEXT:        Post-Dominator Tree Construction
 ; GCN-O3-NEXT:        Unify divergent function exit nodes
 ; GCN-O3-NEXT:        Dominator Tree Construction
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
index c9dbadcbd2315..cacdc8237d5f3 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -2103,10 +2103,7 @@ define void @crash_lshlrevb16_not_reg_op() {
 ; NOSDWA:       ; %bb.0: ; %bb0
 ; NOSDWA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; NOSDWA-NEXT:    s_mov_b64 s[4:5], 0
-; NOSDWA-NEXT:    v_mov_b32_e32 v0, 0xff
-; NOSDWA-NEXT:    v_and_b32_e32 v0, s4, v0
-; NOSDWA-NEXT:    v_lshlrev_b16_e64 v1, 8, 1
-; NOSDWA-NEXT:    v_or_b32_e32 v0, v0, v1
+; NOSDWA-NEXT:    v_mov_b32_e32 v0, 0x100
 ; NOSDWA-NEXT:    s_and_b64 vcc, exec, -1
 ; NOSDWA-NEXT:  .LBB22_1: ; %bb1
 ; NOSDWA-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -2126,9 +2123,7 @@ define void @crash_lshlrevb16_not_reg_op() {
 ; GFX89:       ; %bb.0: ; %bb0
 ; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_mov_b64 s[4:5], 0
-; GFX89-NEXT:    v_lshlrev_b16_e64 v0, 8, 1
-; GFX89-NEXT:    v_mov_b32_e32 v1, s4
-; GFX89-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX89-NEXT:    v_mov_b32_e32 v0, 0x100
 ; GFX89-NEXT:    s_and_b64 vcc, exec, -1
 ; GFX89-NEXT:  .LBB22_1: ; %bb1
 ; GFX89-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -2148,8 +2143,7 @@ define void @crash_lshlrevb16_not_reg_op() {
 ; GFX9:       ; %bb.0: ; %bb0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
-; GFX9-NEXT:    v_lshlrev_b16_e64 v0, 8, 1
-; GFX9-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x100
 ; GFX9-NEXT:    s_and_b64 vcc, exec, -1
 ; GFX9-NEXT:  .LBB22_1: ; %bb1
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -2168,18 +2162,16 @@ define void @crash_lshlrevb16_not_reg_op() {
 ; GFX10-LABEL: crash_lshlrevb16_not_reg_op:
 ; GFX10:       ; %bb.0: ; %bb0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b16 v0, 8, 1
-; GFX10-NEXT:    s_mov_b32 vcc_lo, exec_lo
-; GFX10-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    s_mov_b64 s[4:5], 0
+; GFX10-NEXT:    s_mov_b32 vcc_lo, exec_lo
 ; GFX10-NEXT:  .LBB22_1: ; %bb1
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_lshl_b32 s6, s4, 3
-; GFX10-NEXT:    v_mov_b32_e32 v1, s4
-; GFX10-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-NEXT:    v_lshrrev_b16 v3, s6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-NEXT:    v_lshrrev_b16 v2, s6, 0x100
 ; GFX10-NEXT:    s_mov_b64 s[4:5], 1
-; GFX10-NEXT:    flat_store_byte v[1:2], v3
+; GFX10-NEXT:    flat_store_byte v[0:1], v2
 ; GFX10-NEXT:    s_cbranch_vccnz .LBB22_1
 ; GFX10-NEXT:  ; %bb.2: ; %DummyReturnBlock
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-loop-carried.ll b/llvm/test/CodeGen/AMDGPU/vni8-loop-carried.ll
new file mode 100644
index 0000000000000..1020990edecac
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vni8-loop-carried.ll
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
+
+define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: v32i8_liveout:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 5, v0
+; GFX906-NEXT:    v_cmp_lt_u32_e32 vcc, 14, v0
+; GFX906-NEXT:    s_mov_b32 s4, 0x2000604
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    global_load_dword v1, v1, s[2:3]
+; GFX906-NEXT:    s_mov_b64 s[2:3], 0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_mov_b32_e32 v0, v1
+; GFX906-NEXT:  .LBB0_1: ; %bb.1
+; GFX906-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX906-NEXT:    s_and_b64 s[6:7], exec, vcc
+; GFX906-NEXT:    s_or_b64 s[2:3], s[6:7], s[2:3]
+; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX906-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX906-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX906-NEXT:  ; %bb.2: ; %bb.2.loopexit
+; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX906-NEXT:    v_mov_b32_e32 v1, 0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX906-NEXT:    s_endpgm
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  br label %bb.1
+
+bb.1:
+  %temp = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+  %vec2 = shufflevector <4 x i8> %vec1, <4 x i8> %temp, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %cmp = icmp ult i32 %idx, 15
+  br i1 %cmp, label %bb.1, label %bb.2
+  br label %bb.2
+
+bb.2:
+  store <4 x i8> %vec2, ptr addrspace(1) %dst, align 4
+  ret void
+}

>From 6d4aa3918047655a595d8da9e26b1942d45107d4 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 19 Apr 2024 09:51:25 -0700
Subject: [PATCH 03/17] replace auto

Change-Id: I1b461e3194a27e5e3c45500cae0ef5d4d6540d59
---
 .../AMDGPU/AMDGPULateCodeGenPrepare.cpp       | 58 +++++++++----------
 1 file changed, 29 insertions(+), 29 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index f19f145ae8606..927c5f1506ae8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -185,7 +185,7 @@ bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
   // cases, vectors of illegal types will be scalarized and widened, with each
   // scalar living in its own physical register. The optimization converts the
   // vectors to equivalent vectors of legal type (which are convereted back
-  // before uses in subsequenmt blocks), to pack the bits into fewer physical
+  // before uses in subsequent blocks), to pack the bits into fewer physical
   // registers (used in CopyToReg/CopyFromReg pairs).
   LiveRegOptimizer LRO(Mod);
 
@@ -221,7 +221,7 @@ bool LiveRegOptimizer::replaceUses(Instruction &I) {
   FromLRC.setNewType(getCompatibleType(FromLRC.getLiveRegDef()));
   for (auto IUser = I.user_begin(); IUser != I.user_end(); IUser++) {
 
-    if (auto UserInst = dyn_cast<Instruction>(*IUser)) {
+    if (Instruction *UserInst = dyn_cast<Instruction>(*IUser)) {
       if (UserInst->getParent() != I.getParent() || isa<PHINode>(UserInst)) {
         LLVM_DEBUG(dbgs() << *UserInst << "\n\tUses "
                           << *FromLRC.getOriginalType()
@@ -233,9 +233,9 @@ bool LiveRegOptimizer::replaceUses(Instruction &I) {
         // only replace the PHI node once we have converted all the operands
         if (auto PhiInst = dyn_cast<PHINode>(UserInst)) {
           for (unsigned Idx = 0; Idx < PhiInst->getNumIncomingValues(); Idx++) {
-            auto IncVal = PhiInst->getIncomingValue(Idx);
+            Value *IncVal = PhiInst->getIncomingValue(Idx);
             if (&I == dyn_cast<Instruction>(IncVal)) {
-              auto IncBlock = PhiInst->getIncomingBlock(Idx);
+              BasicBlock *IncBlock = PhiInst->getIncomingBlock(Idx);
               auto PHIOps = find_if(
                   PHIUpdater,
                   [&UserInst](
@@ -322,17 +322,17 @@ bool LiveRegOptimizer::replacePHIs() {
 }
 
 Type *LiveRegOptimizer::getCompatibleType(Instruction *InstToConvert) {
-  auto OriginalType = InstToConvert->getType();
+  Type *OriginalType = InstToConvert->getType();
   assert(OriginalType->getScalarSizeInBits() <=
          ConvertToScalar->getScalarSizeInBits());
-  auto VTy = dyn_cast<VectorType>(OriginalType);
+  VectorType *VTy = dyn_cast<VectorType>(OriginalType);
   if (!VTy)
     return ConvertToScalar;
 
-  auto OriginalSize =
+  unsigned OriginalSize =
       VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue();
-  auto ConvertScalarSize = ConvertToScalar->getScalarSizeInBits();
-  auto ConvertEltCount =
+  unsigned ConvertScalarSize = ConvertToScalar->getScalarSizeInBits();
+  unsigned ConvertEltCount =
       (OriginalSize + ConvertScalarSize - 1) / ConvertScalarSize;
 
   if (OriginalSize <= ConvertScalarSize)
@@ -348,16 +348,16 @@ void LiveRegOptimizer::convertToOptType(LiveRegConversion &LR) {
     return;
   }
 
-  auto VTy = cast<VectorType>(LR.getOriginalType());
+  VectorType *VTy = cast<VectorType>(LR.getOriginalType());
 
-  auto NewTy = LR.getNewType();
+  Type *NewTy = LR.getNewType();
   assert(NewTy);
-  auto NewVTy = NewTy->isVectorTy() ? cast<VectorType>(NewTy) : nullptr;
+  VectorType *NewVTy = NewTy->isVectorTy() ? cast<VectorType>(NewTy) : nullptr;
 
-  auto V = static_cast<Value *>(LR.getLiveRegDef());
-  auto OriginalSize =
+  Value *V = static_cast<Value *>(LR.getLiveRegDef());
+  unsigned OriginalSize =
       VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue();
-  auto NewSize = NewTy->isVectorTy()
+  unsigned NewSize = NewTy->isVectorTy()
                      ? NewVTy->getScalarSizeInBits() *
                            NewVTy->getElementCount().getFixedValue()
                      : NewTy->getScalarSizeInBits();
@@ -375,7 +375,7 @@ void LiveRegOptimizer::convertToOptType(LiveRegConversion &LR) {
 
   // If there is a bitsize mismatch, we must use a wider vector
   assert(NewSize > OriginalSize);
-  auto ExpandedVecElementCount =
+  ElementCount ExpandedVecElementCount =
       llvm::ElementCount::getFixed(NewSize / VTy->getScalarSizeInBits());
 
   SmallVector<int, 8> ShuffleMask;
@@ -396,18 +396,18 @@ void LiveRegOptimizer::convertToOptType(LiveRegConversion &LR) {
 }
 
 void LiveRegOptimizer::convertFromOptType(LiveRegConversion &LRC) {
-  auto OTy = LRC.getOriginalType();
-  auto VTy =
+  Type *OTy = LRC.getOriginalType();
+  VectorType *VTy =
       OTy->isVectorTy() ? dyn_cast<VectorType>(LRC.getOriginalType()) : nullptr;
 
-  auto NewVTy = cast<VectorType>(LRC.getNewType());
+  VectorType *NewVTy = cast<VectorType>(LRC.getNewType());
 
-  auto V = static_cast<Value *>(LRC.getLiveRegDef());
-  auto OriginalSize =
+  Value *V = static_cast<Value *>(LRC.getLiveRegDef());
+  unsigned OriginalSize =
       OTy->isVectorTy()
           ? VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue()
           : OTy->getScalarSizeInBits();
-  auto NewSize =
+  unsigned NewSize =
       NewVTy->getScalarSizeInBits() * NewVTy->getElementCount().getFixedValue();
 
   auto &Builder = LRC.getConvertBuilder();
@@ -433,20 +433,20 @@ void LiveRegOptimizer::convertFromOptType(LiveRegConversion &LRC) {
   // If there is a bitsize mismatch, we have used a wider vector and must strip
   // the MSBs to convert back to the original type
   assert(OriginalSize > NewSize);
-  auto ExpandedVecElementCount = llvm::ElementCount::getFixed(
+  ElementCount ExpandedVecElementCount = llvm::ElementCount::getFixed(
       OriginalSize / NewVTy->getScalarSizeInBits());
-  auto ExpandedVT = VectorType::get(
+  VectorType *ExpandedVT = VectorType::get(
       Type::getIntNTy(Mod->getContext(), NewVTy->getScalarSizeInBits()),
       ExpandedVecElementCount);
-  auto Converted = dyn_cast<Instruction>(
+  Instruction *Converted = dyn_cast<Instruction>(
       Builder.CreateBitCast(LRC.getLiveRegDef(), ExpandedVT));
 
-  auto NarrowElementCount = NewVTy->getElementCount().getFixedValue();
+  unsigned NarrowElementCount = NewVTy->getElementCount().getFixedValue();
   SmallVector<int, 8> ShuffleMask;
   for (uint64_t I = 0; I < NarrowElementCount; I++)
     ShuffleMask.push_back(I);
 
-  auto NarrowVec = dyn_cast<Instruction>(
+  Instruction *NarrowVec = dyn_cast<Instruction>(
       Builder.CreateShuffleVector(Converted, ShuffleMask));
   LRC.setConverted(dyn_cast<Instruction>(NarrowVec));
   LLVM_DEBUG(dbgs() << "\tProduced for user: " << **LRC.getConverted() << "\n");
@@ -457,7 +457,7 @@ bool LiveRegOptimizer::shouldReplaceUses(const Instruction &I) {
   // Vectors of illegal types are copied across blocks in an efficient manner.
   // They are scalarized and widened to legal scalars. In such cases, we can do
   // better by using legal vector types
-  auto IType = I.getType();
+  Type *IType = I.getType();
   return IType->isVectorTy() && IType->getScalarSizeInBits() < 16 &&
          !I.getType()->getScalarType()->isPointerTy();
 }
@@ -471,7 +471,7 @@ bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
   // Skip non-simple loads.
   if (!LI.isSimple())
     return false;
-  auto *Ty = LI.getType();
+  Type *Ty = LI.getType();
   // Skip aggregate types.
   if (Ty->isAggregateType())
     return false;

>From 85493c0a1df5d90baef912062cd67cfd5a19dc52 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 19 Apr 2024 10:15:38 -0700
Subject: [PATCH 04/17] Delete std::optional usage

Change-Id: Ia56d86e1acf191d19f6fc43ae780de9bb5118ba9
---
 .../AMDGPU/AMDGPULateCodeGenPrepare.cpp       | 28 +++++++++----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 927c5f1506ae8..070fefc31132c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -92,7 +92,7 @@ class LiveRegConversion {
   Type *NewType;
   // The instruction sequence that converts the virtual register, to be used
   // instead of the original
-  std::optional<Instruction *> Converted;
+  Instruction *Converted = nullptr;
   // The builder used to build the conversion instruction
   IRBuilder<> ConvertBuilder;
 
@@ -107,13 +107,13 @@ class LiveRegConversion {
   void setNewType(Type *NewType) { this->NewType = NewType; }
   // The instruction that conerts the virtual register, to be used instead of
   // the original
-  std::optional<Instruction *> &getConverted() { return Converted; }
+  Instruction *getConverted() { return Converted; }
   void setConverted(Instruction *Converted) { this->Converted = Converted; }
   // The builder used to build the conversion instruction
   IRBuilder<> &getConvertBuilder() { return ConvertBuilder; }
   // Do we have a instruction sequence which convert the original virtual
   // register
-  bool hasConverted() { return Converted.has_value(); }
+  bool hasConverted() { return Converted != nullptr; }
 
   LiveRegConversion(Instruction *LiveRegDef, BasicBlock *InsertBlock,
                     BasicBlock::iterator InsertPt)
@@ -246,9 +246,9 @@ bool LiveRegOptimizer::replaceUses(Instruction &I) {
 
               if (PHIOps == PHIUpdater.end())
                 PHIUpdater.push_back(
-                    {UserInst, {{*FromLRC.getConverted(), IncBlock}}});
+                    {UserInst, {{FromLRC.getConverted(), IncBlock}}});
               else
-                PHIOps->second.push_back({*FromLRC.getConverted(), IncBlock});
+                PHIOps->second.push_back({FromLRC.getConverted(), IncBlock});
 
               break;
             }
@@ -264,13 +264,13 @@ bool LiveRegOptimizer::replaceUses(Instruction &I) {
           continue;
         }
 
-        LiveRegConversion ToLRC(*FromLRC.getConverted(), I.getType(),
+        LiveRegConversion ToLRC(FromLRC.getConverted(), I.getType(),
                                 UserInst->getParent(),
                                 static_cast<BasicBlock::iterator>(
                                     UserInst->getParent()->getFirstNonPHIIt()));
         convertFromOptType(ToLRC);
         assert(ToLRC.hasConverted());
-        UseConvertTracker[UserInst->getParent()] = {*ToLRC.getConverted(),
+        UseConvertTracker[UserInst->getParent()] = {ToLRC.getConverted(),
                                                     {UserInst}};
       }
     }
@@ -312,7 +312,7 @@ bool LiveRegOptimizer::replacePHIs() {
                                   ThePHINode->getParent()->getFirstNonPHIIt()));
       convertFromOptType(ToLRC);
       assert(ToLRC.hasConverted());
-      Ele.first->replaceAllUsesWith(*ToLRC.getConverted());
+      Ele.first->replaceAllUsesWith(ToLRC.getConverted());
       // The old PHI is no longer used
       ThePHINode->eraseFromParent();
       MadeChange = true;
@@ -368,8 +368,8 @@ void LiveRegOptimizer::convertToOptType(LiveRegConversion &LR) {
   // desired type
   if (OriginalSize == NewSize) {
     LR.setConverted(dyn_cast<Instruction>(Builder.CreateBitCast(V, NewTy)));
-    LLVM_DEBUG(dbgs() << "\tConverted def to "
-                      << *(*LR.getConverted())->getType() << "\n");
+    LLVM_DEBUG(dbgs() << "\tConverted def to " << *LR.getConverted()->getType()
+                      << "\n");
     return;
   }
 
@@ -390,7 +390,7 @@ void LiveRegOptimizer::convertToOptType(LiveRegConversion &LR) {
       dyn_cast<Instruction>(Builder.CreateShuffleVector(V, ShuffleMask));
   LR.setConverted(
       dyn_cast<Instruction>(Builder.CreateBitCast(ExpandedVec, NewTy)));
-  LLVM_DEBUG(dbgs() << "\tConverted def to " << *(*LR.getConverted())->getType()
+  LLVM_DEBUG(dbgs() << "\tConverted def to " << *LR.getConverted()->getType()
                     << "\n");
   return;
 }
@@ -415,7 +415,7 @@ void LiveRegOptimizer::convertFromOptType(LiveRegConversion &LRC) {
   // If there is a bitsize match, we simply convert back to the original type
   if (OriginalSize == NewSize) {
     LRC.setConverted(dyn_cast<Instruction>(Builder.CreateBitCast(V, NewVTy)));
-    LLVM_DEBUG(dbgs() << "\tProduced for user: " << **LRC.getConverted()
+    LLVM_DEBUG(dbgs() << "\tProduced for user: " << *LRC.getConverted()
                       << "\n");
     return;
   }
@@ -425,7 +425,7 @@ void LiveRegOptimizer::convertFromOptType(LiveRegConversion &LRC) {
         LRC.getLiveRegDef(), IntegerType::get(Mod->getContext(), NewSize)));
     auto Original = dyn_cast<Instruction>(Builder.CreateBitCast(Trunc, NewVTy));
     LRC.setConverted(dyn_cast<Instruction>(Original));
-    LLVM_DEBUG(dbgs() << "\tProduced for user: " << **LRC.getConverted()
+    LLVM_DEBUG(dbgs() << "\tProduced for user: " << *LRC.getConverted()
                       << "\n");
     return;
   }
@@ -449,7 +449,7 @@ void LiveRegOptimizer::convertFromOptType(LiveRegConversion &LRC) {
   Instruction *NarrowVec = dyn_cast<Instruction>(
       Builder.CreateShuffleVector(Converted, ShuffleMask));
   LRC.setConverted(dyn_cast<Instruction>(NarrowVec));
-  LLVM_DEBUG(dbgs() << "\tProduced for user: " << **LRC.getConverted() << "\n");
+  LLVM_DEBUG(dbgs() << "\tProduced for user: " << *LRC.getConverted() << "\n");
   return;
 }
 

>From 647885fabec1ae5d6d552196fd277b642ebe0bae Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 19 Apr 2024 10:27:06 -0700
Subject: [PATCH 05/17] query size instead of calculation

Change-Id: I8eeacb7d4292a215bb0540e8e7dd12ab7547d058
---
 .../AMDGPU/AMDGPULateCodeGenPrepare.cpp       | 31 +++++--------------
 1 file changed, 7 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 070fefc31132c..5276598efeb6f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -329,8 +329,7 @@ Type *LiveRegOptimizer::getCompatibleType(Instruction *InstToConvert) {
   if (!VTy)
     return ConvertToScalar;
 
-  unsigned OriginalSize =
-      VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue();
+  unsigned OriginalSize = VTy->getPrimitiveSizeInBits();
   unsigned ConvertScalarSize = ConvertToScalar->getScalarSizeInBits();
   unsigned ConvertEltCount =
       (OriginalSize + ConvertScalarSize - 1) / ConvertScalarSize;
@@ -349,21 +348,13 @@ void LiveRegOptimizer::convertToOptType(LiveRegConversion &LR) {
   }
 
   VectorType *VTy = cast<VectorType>(LR.getOriginalType());
-
   Type *NewTy = LR.getNewType();
-  assert(NewTy);
-  VectorType *NewVTy = NewTy->isVectorTy() ? cast<VectorType>(NewTy) : nullptr;
 
-  Value *V = static_cast<Value *>(LR.getLiveRegDef());
-  unsigned OriginalSize =
-      VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue();
-  unsigned NewSize = NewTy->isVectorTy()
-                     ? NewVTy->getScalarSizeInBits() *
-                           NewVTy->getElementCount().getFixedValue()
-                     : NewTy->getScalarSizeInBits();
+  unsigned OriginalSize = VTy->getPrimitiveSizeInBits();
+  unsigned NewSize = NewTy->getPrimitiveSizeInBits();
 
   auto &Builder = LR.getConvertBuilder();
-
+  Value *V = static_cast<Value *>(LR.getLiveRegDef());
   // If there is a bitsize match, we can fit the old vector into a new vector of
   // desired type
   if (OriginalSize == NewSize) {
@@ -397,21 +388,13 @@ void LiveRegOptimizer::convertToOptType(LiveRegConversion &LR) {
 
 void LiveRegOptimizer::convertFromOptType(LiveRegConversion &LRC) {
   Type *OTy = LRC.getOriginalType();
-  VectorType *VTy =
-      OTy->isVectorTy() ? dyn_cast<VectorType>(LRC.getOriginalType()) : nullptr;
-
   VectorType *NewVTy = cast<VectorType>(LRC.getNewType());
 
-  Value *V = static_cast<Value *>(LRC.getLiveRegDef());
-  unsigned OriginalSize =
-      OTy->isVectorTy()
-          ? VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue()
-          : OTy->getScalarSizeInBits();
-  unsigned NewSize =
-      NewVTy->getScalarSizeInBits() * NewVTy->getElementCount().getFixedValue();
+  unsigned OriginalSize = OTy->getPrimitiveSizeInBits();
+  unsigned NewSize = NewVTy->getPrimitiveSizeInBits();
 
   auto &Builder = LRC.getConvertBuilder();
-
+  Value *V = static_cast<Value *>(LRC.getLiveRegDef());
   // If there is a bitsize match, we simply convert back to the original type
   if (OriginalSize == NewSize) {
     LRC.setConverted(dyn_cast<Instruction>(Builder.CreateBitCast(V, NewVTy)));

>From 95ee7a5fe04956a91daccf2d1a74a513a4273eb7 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 19 Apr 2024 10:36:59 -0700
Subject: [PATCH 06/17] rename LiveRegConversion

Change-Id: I94504f26819c45de7496b39fee8031bcda0f29fb
---
 .../AMDGPU/AMDGPULateCodeGenPrepare.cpp       | 72 ++++++++++---------
 1 file changed, 37 insertions(+), 35 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 5276598efeb6f..31eedcfe6dee9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -81,7 +81,7 @@ class AMDGPULateCodeGenPrepare
   bool visitLoadInst(LoadInst &LI);
 };
 
-class LiveRegConversion {
+class ConversionCandidateInfo {
 private:
   // The instruction which defined the original virtual register used across
   // blocks
@@ -115,12 +115,13 @@ class LiveRegConversion {
   // register
   bool hasConverted() { return Converted != nullptr; }
 
-  LiveRegConversion(Instruction *LiveRegDef, BasicBlock *InsertBlock,
-                    BasicBlock::iterator InsertPt)
+  ConversionCandidateInfo(Instruction *LiveRegDef, BasicBlock *InsertBlock,
+                          BasicBlock::iterator InsertPt)
       : LiveRegDef(LiveRegDef), OriginalType(LiveRegDef->getType()),
         ConvertBuilder(InsertBlock, InsertPt) {}
-  LiveRegConversion(Instruction *LiveRegDef, Type *NewType,
-                    BasicBlock *InsertBlock, BasicBlock::iterator InsertPt)
+  ConversionCandidateInfo(Instruction *LiveRegDef, Type *NewType,
+                          BasicBlock *InsertBlock,
+                          BasicBlock::iterator InsertPt)
       : LiveRegDef(LiveRegDef), OriginalType(LiveRegDef->getType()),
         NewType(NewType), ConvertBuilder(InsertBlock, InsertPt) {}
 };
@@ -140,10 +141,10 @@ class LiveRegOptimizer {
   // Should the def of the instruction be converted if it is live across blocks
   bool shouldReplaceUses(const Instruction &I);
   // Convert the virtual register to the compatible vector of legal type
-  void convertToOptType(LiveRegConversion &LR);
+  void convertToOptType(ConversionCandidateInfo &LR);
   // Convert the virtual register back to the original type, stripping away
   // the MSBs in cases where there was an imperfect fit (e.g. v2i32 -> v7i8)
-  void convertFromOptType(LiveRegConversion &LR);
+  void convertFromOptType(ConversionCandidateInfo &LR);
   // Get a vector of desired scalar type that is compatible with the original
   // vector. In cases where there is no bitsize equivalent using a legal vector
   // type, we pad the MSBs (e.g. v7i8 -> v2i32)
@@ -213,21 +214,21 @@ bool LiveRegOptimizer::replaceUses(Instruction &I) {
     Instruction *Converted;
     SmallVector<Instruction *, 4> Users;
   };
-  DenseMap<BasicBlock *, ConvertUseInfo> UseConvertTracker;
+  DenseMap<BasicBlock *, ConvertUseInfo> InsertedConversionMap;
 
-  LiveRegConversion FromLRC(
+  ConversionCandidateInfo FromCCI(
       &I, I.getParent(),
       static_cast<BasicBlock::iterator>(std::next(I.getIterator())));
-  FromLRC.setNewType(getCompatibleType(FromLRC.getLiveRegDef()));
+  FromCCI.setNewType(getCompatibleType(FromCCI.getLiveRegDef()));
   for (auto IUser = I.user_begin(); IUser != I.user_end(); IUser++) {
 
     if (Instruction *UserInst = dyn_cast<Instruction>(*IUser)) {
       if (UserInst->getParent() != I.getParent() || isa<PHINode>(UserInst)) {
         LLVM_DEBUG(dbgs() << *UserInst << "\n\tUses "
-                          << *FromLRC.getOriginalType()
+                          << *FromCCI.getOriginalType()
                           << " from previous block. Needs conversion\n");
-        convertToOptType(FromLRC);
-        if (!FromLRC.hasConverted())
+        convertToOptType(FromCCI);
+        if (!FromCCI.hasConverted())
           continue;
         // If it is a PHI node, just create and collect the new operand. We can
         // only replace the PHI node once we have converted all the operands
@@ -246,9 +247,9 @@ bool LiveRegOptimizer::replaceUses(Instruction &I) {
 
               if (PHIOps == PHIUpdater.end())
                 PHIUpdater.push_back(
-                    {UserInst, {{FromLRC.getConverted(), IncBlock}}});
+                    {UserInst, {{FromCCI.getConverted(), IncBlock}}});
               else
-                PHIOps->second.push_back({FromLRC.getConverted(), IncBlock});
+                PHIOps->second.push_back({FromCCI.getConverted(), IncBlock});
 
               break;
             }
@@ -258,27 +259,28 @@ bool LiveRegOptimizer::replaceUses(Instruction &I) {
 
         // Do not create multiple conversion sequences if there are multiple
         // uses in the same block
-        if (UseConvertTracker.contains(UserInst->getParent())) {
-          UseConvertTracker[UserInst->getParent()].Users.push_back(UserInst);
+        if (InsertedConversionMap.contains(UserInst->getParent())) {
+          InsertedConversionMap[UserInst->getParent()].Users.push_back(
+              UserInst);
           LLVM_DEBUG(dbgs() << "\tUser already has access to converted def\n");
           continue;
         }
 
-        LiveRegConversion ToLRC(FromLRC.getConverted(), I.getType(),
-                                UserInst->getParent(),
-                                static_cast<BasicBlock::iterator>(
-                                    UserInst->getParent()->getFirstNonPHIIt()));
-        convertFromOptType(ToLRC);
-        assert(ToLRC.hasConverted());
-        UseConvertTracker[UserInst->getParent()] = {ToLRC.getConverted(),
-                                                    {UserInst}};
+        ConversionCandidateInfo ToCCI(
+            FromCCI.getConverted(), I.getType(), UserInst->getParent(),
+            static_cast<BasicBlock::iterator>(
+                UserInst->getParent()->getFirstNonPHIIt()));
+        convertFromOptType(ToCCI);
+        assert(ToCCI.hasConverted());
+        InsertedConversionMap[UserInst->getParent()] = {ToCCI.getConverted(),
+                                                        {UserInst}};
       }
     }
   }
 
   // Replace uses of with in a separate loop that is not dependent upon the
   // state of the uses
-  for (auto &Entry : UseConvertTracker) {
+  for (auto &Entry : InsertedConversionMap) {
     for (auto &UserInst : Entry.second.Users) {
       LLVM_DEBUG(dbgs() << *UserInst
                         << "\n\tNow uses: " << *Entry.second.Converted << "\n");
@@ -306,13 +308,13 @@ bool LiveRegOptimizer::replacePHIs() {
                           << "  For: " << IncVals.second->getName() << "\n");
       }
       LLVM_DEBUG(dbgs() << "Sucessfully replaced with " << *NPHI << "\n");
-      LiveRegConversion ToLRC(NPHI, ThePHINode->getType(),
-                              ThePHINode->getParent(),
-                              static_cast<BasicBlock::iterator>(
-                                  ThePHINode->getParent()->getFirstNonPHIIt()));
-      convertFromOptType(ToLRC);
-      assert(ToLRC.hasConverted());
-      Ele.first->replaceAllUsesWith(ToLRC.getConverted());
+      ConversionCandidateInfo ToCCI(
+          NPHI, ThePHINode->getType(), ThePHINode->getParent(),
+          static_cast<BasicBlock::iterator>(
+              ThePHINode->getParent()->getFirstNonPHIIt()));
+      convertFromOptType(ToCCI);
+      assert(ToCCI.hasConverted());
+      Ele.first->replaceAllUsesWith(ToCCI.getConverted());
       // The old PHI is no longer used
       ThePHINode->eraseFromParent();
       MadeChange = true;
@@ -341,7 +343,7 @@ Type *LiveRegOptimizer::getCompatibleType(Instruction *InstToConvert) {
                          llvm::ElementCount::getFixed(ConvertEltCount));
 }
 
-void LiveRegOptimizer::convertToOptType(LiveRegConversion &LR) {
+void LiveRegOptimizer::convertToOptType(ConversionCandidateInfo &LR) {
   if (LR.hasConverted()) {
     LLVM_DEBUG(dbgs() << "\tAlready has converted def\n");
     return;
@@ -386,7 +388,7 @@ void LiveRegOptimizer::convertToOptType(LiveRegConversion &LR) {
   return;
 }
 
-void LiveRegOptimizer::convertFromOptType(LiveRegConversion &LRC) {
+void LiveRegOptimizer::convertFromOptType(ConversionCandidateInfo &LRC) {
   Type *OTy = LRC.getOriginalType();
   VectorType *NewVTy = cast<VectorType>(LRC.getNewType());
 

>From 7fe461c3f49902b638bc4fd01ccd7d0f97ff9f53 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 19 Apr 2024 10:46:58 -0700
Subject: [PATCH 07/17] simplify initialization of shufflemask vector

Change-Id: I4383004240dc0365de6e67b12dc9ea5b609826d2
---
 llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 31eedcfe6dee9..d0e0977d7bb4e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -427,9 +427,8 @@ void LiveRegOptimizer::convertFromOptType(ConversionCandidateInfo &LRC) {
       Builder.CreateBitCast(LRC.getLiveRegDef(), ExpandedVT));
 
   unsigned NarrowElementCount = NewVTy->getElementCount().getFixedValue();
-  SmallVector<int, 8> ShuffleMask;
-  for (uint64_t I = 0; I < NarrowElementCount; I++)
-    ShuffleMask.push_back(I);
+  SmallVector<int, 8> ShuffleMask(NarrowElementCount);
+  std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);
 
   Instruction *NarrowVec = dyn_cast<Instruction>(
       Builder.CreateShuffleVector(Converted, ShuffleMask));

>From 09882483cc3922a797bc43440f52898c19817f39 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 19 Apr 2024 10:56:36 -0700
Subject: [PATCH 08/17] precommit global-isel tests

Change-Id: I07bf0cf4537bd3b148dc4ee3b785b989f0aac8b0
---
 .../AMDGPU/GlobalISel/vni8-across-blocks.ll   | 2120 +++++++++++++++++
 .../AMDGPU/GlobalISel/vni8-loop-carried.ll    |   67 +
 2 files changed, 2187 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-loop-carried.ll

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
new file mode 100644
index 0000000000000..cbb8fede31efa
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
@@ -0,0 +1,2120 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
+
+define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: v3i8_liveout:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
+; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    global_load_dword v1, v4, s[4:5]
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX906-NEXT:    s_cbranch_execz .LBB0_2
+; GFX906-NEXT:  ; %bb.1: ; %bb.1
+; GFX906-NEXT:    global_load_dword v1, v4, s[6:7]
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX906-NEXT:  .LBB0_2: ; %bb.2
+; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX906-NEXT:    v_and_b32_e32 v0, 0xff, v3
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    s_mov_b32 s0, 0xffff
+; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX906-NEXT:    v_and_b32_sdwa v1, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX906-NEXT:    v_mov_b32_e32 v1, 0
+; GFX906-NEXT:    global_store_short v1, v0, s[2:3]
+; GFX906-NEXT:    global_store_byte_d16_hi v1, v0, s[2:3] offset:2
+; GFX906-NEXT:    s_endpgm
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <3 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <3 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <3 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <3 x i8>, ptr addrspace(1) %gep2
+  %cmp = icmp ult i32 %idx, 15
+  br i1 %cmp, label %bb.1, label %bb.2
+bb.1:
+  br label %bb.2
+
+bb.2:
+  %tmp5 = phi <3 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+  store <3 x i8> %tmp5, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: v4i8_liveout:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 2, v0
+; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    global_load_dword v1, v5, s[4:5]
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
+; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX906-NEXT:    s_cbranch_execz .LBB1_2
+; GFX906-NEXT:  ; %bb.1: ; %bb.1
+; GFX906-NEXT:    global_load_dword v1, v5, s[6:7]
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
+; GFX906-NEXT:  .LBB1_2: ; %bb.2
+; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX906-NEXT:    v_mov_b32_e32 v5, 8
+; GFX906-NEXT:    v_mov_b32_e32 v0, 0xff
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v0, v1, v0, v2
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v3
+; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
+; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX906-NEXT:    v_mov_b32_e32 v1, 0
+; GFX906-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX906-NEXT:    s_endpgm
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+  %cmp = icmp ult i32 %idx, 15
+  br i1 %cmp, label %bb.1, label %bb.2
+bb.1:
+  br label %bb.2
+
+bb.2:
+  %tmp5 = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+  store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: v5i8_liveout:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
+; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v6, s[4:5]
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX906-NEXT:    s_cbranch_execz .LBB2_2
+; GFX906-NEXT:  ; %bb.1: ; %bb.1
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v6, s[6:7]
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX906-NEXT:  .LBB2_2: ; %bb.2
+; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX906-NEXT:    v_mov_b32_e32 v0, 0
+; GFX906-NEXT:    global_store_byte v0, v1, s[2:3]
+; GFX906-NEXT:    global_store_byte v0, v3, s[2:3] offset:1
+; GFX906-NEXT:    global_store_byte v0, v4, s[2:3] offset:2
+; GFX906-NEXT:    global_store_byte v0, v5, s[2:3] offset:3
+; GFX906-NEXT:    global_store_byte v0, v2, s[2:3] offset:4
+; GFX906-NEXT:    s_endpgm
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <5 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <5 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <5 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <5 x i8>, ptr addrspace(1) %gep2
+  %cmp = icmp ult i32 %idx, 15
+  br i1 %cmp, label %bb.1, label %bb.2
+bb.1:
+  br label %bb.2
+
+bb.2:
+  %tmp5 = phi <5 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+  store <5 x i8> %tmp5, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: v8i8_liveout:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 3, v0
+; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v9, s[4:5]
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
+; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX906-NEXT:    s_cbranch_execz .LBB3_2
+; GFX906-NEXT:  ; %bb.1: ; %bb.1
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v9, s[6:7]
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
+; GFX906-NEXT:  .LBB3_2: ; %bb.2
+; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX906-NEXT:    v_mov_b32_e32 v10, 8
+; GFX906-NEXT:    v_mov_b32_e32 v9, 0xff
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v0, v1, v9, v0
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v4
+; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 24, v5
+; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v3
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v10, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v1, v2, v9, v1
+; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v7
+; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 24, v8
+; GFX906-NEXT:    v_or3_b32 v1, v1, v2, v3
+; GFX906-NEXT:    v_mov_b32_e32 v2, 0
+; GFX906-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX906-NEXT:    s_endpgm
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
+  %cmp = icmp ult i32 %idx, 15
+  br i1 %cmp, label %bb.1, label %bb.2
+bb.1:
+  br label %bb.2
+
+bb.2:
+  %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+  store <8 x i8> %tmp5, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: v16i8_liveout:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX906-NEXT:    v_lshlrev_b32_e32 v17, 4, v0
+; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    global_load_dwordx4 v[1:4], v17, s[4:5]
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 24, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 8, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 16, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 24, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 8, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 16, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 24, v4
+; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX906-NEXT:    s_cbranch_execz .LBB4_2
+; GFX906-NEXT:  ; %bb.1: ; %bb.1
+; GFX906-NEXT:    global_load_dwordx4 v[1:4], v17, s[6:7]
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 24, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 8, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 16, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 24, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 8, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 16, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 24, v4
+; GFX906-NEXT:  .LBB4_2: ; %bb.2
+; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX906-NEXT:    v_mov_b32_e32 v18, 8
+; GFX906-NEXT:    v_mov_b32_e32 v17, 0xff
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v18, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v0, v1, v17, v0
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v6
+; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 24, v7
+; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v5
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v18, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v1, v2, v17, v1
+; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v9
+; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 24, v10
+; GFX906-NEXT:    v_or3_b32 v1, v1, v2, v5
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v2, v3, v17, v2
+; GFX906-NEXT:    v_and_b32_e32 v3, 0xff, v12
+; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 24, v13
+; GFX906-NEXT:    v_or3_b32 v2, v2, v3, v5
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v18, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v3, v4, v17, v3
+; GFX906-NEXT:    v_and_b32_e32 v4, 0xff, v15
+; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 24, v16
+; GFX906-NEXT:    v_or3_b32 v3, v3, v4, v5
+; GFX906-NEXT:    v_mov_b32_e32 v4, 0
+; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX906-NEXT:    s_endpgm
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <16 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <16 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <16 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <16 x i8>, ptr addrspace(1) %gep2
+  %cmp = icmp ult i32 %idx, 15
+  br i1 %cmp, label %bb.1, label %bb.2
+bb.1:
+  br label %bb.2
+
+bb.2:
+  %tmp5 = phi <16 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+  store <16 x i8> %tmp5, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: v32i8_liveout:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX906-NEXT:    v_lshlrev_b32_e32 v32, 5, v0
+; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    global_load_dwordx4 v[5:8], v32, s[4:5]
+; GFX906-NEXT:    global_load_dwordx4 v[1:4], v32, s[4:5] offset:16
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
+; GFX906-NEXT:    v_lshrrev_b32_e32 v26, 16, v5
+; GFX906-NEXT:    v_lshrrev_b32_e32 v27, 24, v5
+; GFX906-NEXT:    v_lshrrev_b32_e32 v28, 8, v6
+; GFX906-NEXT:    v_lshrrev_b32_e32 v29, 16, v6
+; GFX906-NEXT:    v_lshrrev_b32_e32 v30, 24, v6
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v7
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
+; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 24, v7
+; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 8, v8
+; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 16, v8
+; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 24, v8
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 8, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 16, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 24, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v18, 16, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v19, 24, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v20, 8, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v21, 16, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v22, 24, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v23, 8, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v24, 16, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v25, 24, v4
+; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX906-NEXT:    s_cbranch_execz .LBB5_2
+; GFX906-NEXT:  ; %bb.1: ; %bb.1
+; GFX906-NEXT:    global_load_dwordx4 v[5:8], v32, s[6:7]
+; GFX906-NEXT:    global_load_dwordx4 v[1:4], v32, s[6:7] offset:16
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
+; GFX906-NEXT:    v_lshrrev_b32_e32 v26, 16, v5
+; GFX906-NEXT:    v_lshrrev_b32_e32 v27, 24, v5
+; GFX906-NEXT:    v_lshrrev_b32_e32 v28, 8, v6
+; GFX906-NEXT:    v_lshrrev_b32_e32 v29, 16, v6
+; GFX906-NEXT:    v_lshrrev_b32_e32 v30, 24, v6
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v7
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
+; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 24, v7
+; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 8, v8
+; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 16, v8
+; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 24, v8
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 8, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 16, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 24, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v18, 16, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v19, 24, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v20, 8, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v21, 16, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v22, 24, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v23, 8, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v24, 16, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v25, 24, v4
+; GFX906-NEXT:  .LBB5_2: ; %bb.2
+; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT:    v_mov_b32_e32 v32, 8
+; GFX906-NEXT:    v_mov_b32_e32 v33, 0xff
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v0, v7, v33, v0
+; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v9
+; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 24, v10
+; GFX906-NEXT:    v_or3_b32 v7, v0, v7, v9
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v32, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v0, v8, v33, v0
+; GFX906-NEXT:    v_and_b32_e32 v8, 0xff, v12
+; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 24, v13
+; GFX906-NEXT:    v_or3_b32 v8, v0, v8, v9
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v32, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v0, v1, v33, v0
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v15
+; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 24, v16
+; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v9
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v32, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v1, v2, v33, v1
+; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v18
+; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 24, v19
+; GFX906-NEXT:    v_or3_b32 v1, v1, v2, v9
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v32, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v2, v3, v33, v2
+; GFX906-NEXT:    v_and_b32_e32 v3, 0xff, v21
+; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 24, v22
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v28, v32, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_or3_b32 v2, v2, v3, v9
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v32, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_b32_e32 v26, 0xff, v26
+; GFX906-NEXT:    v_and_or_b32 v6, v6, v33, v28
+; GFX906-NEXT:    v_and_b32_e32 v28, 0xff, v29
+; GFX906-NEXT:    v_and_or_b32 v3, v4, v33, v3
+; GFX906-NEXT:    v_and_b32_e32 v4, 0xff, v24
+; GFX906-NEXT:    v_and_or_b32 v5, v5, v33, v31
+; GFX906-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX906-NEXT:    v_lshlrev_b32_e32 v27, 24, v27
+; GFX906-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; GFX906-NEXT:    v_lshlrev_b32_e32 v29, 24, v30
+; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 24, v25
+; GFX906-NEXT:    v_or3_b32 v5, v5, v26, v27
+; GFX906-NEXT:    v_or3_b32 v6, v6, v28, v29
+; GFX906-NEXT:    v_or3_b32 v3, v3, v4, v9
+; GFX906-NEXT:    v_mov_b32_e32 v4, 0
+; GFX906-NEXT:    global_store_dwordx4 v4, v[5:8], s[0:1]
+; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX906-NEXT:    s_endpgm
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <32 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <32 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <32 x i8>, ptr addrspace(1) %gep2
+  %cmp = icmp ult i32 %idx, 15
+  br i1 %cmp, label %bb.1, label %bb.2
+bb.1:
+  br label %bb.2
+
+bb.2:
+  %tmp5 = phi <32 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+  store <32 x i8> %tmp5, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: v256i8_liveout:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX906-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX906-NEXT:    s_mov_b32 s10, -1
+; GFX906-NEXT:    s_mov_b32 s11, 0xe00000
+; GFX906-NEXT:    s_add_u32 s8, s8, s3
+; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; GFX906-NEXT:    s_addc_u32 s9, s9, 0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    global_load_dwordx4 v[5:8], v4, s[4:5]
+; GFX906-NEXT:    global_load_dwordx4 v[9:12], v4, s[4:5] offset:16
+; GFX906-NEXT:    global_load_dwordx4 v[29:32], v4, s[4:5] offset:32
+; GFX906-NEXT:    global_load_dwordx4 v[13:16], v4, s[4:5] offset:48
+; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v5
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:780 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v5
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v6
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v6
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v7
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v7
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v7
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v8
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:776 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v8
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v8
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(13)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v9
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v9
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v9
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v10
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v10
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v10
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v11
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v11
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v11
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v12
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v12
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    buffer_store_dword v10, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v11, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v12, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v12
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v29
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v29
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v29
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v30
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v30
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v30
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v31
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v31
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v31
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v32
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v32
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v32
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v13
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v13
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v13
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v14
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v14
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v14
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v15
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v15
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v15
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v16
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v16
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v16
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[0:3], v4, s[4:5] offset:64
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    global_load_dwordx4 v[45:48], v4, s[4:5] offset:80
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v0
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v0
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v45
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v45
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v45
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v46
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v46
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v46
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v47
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v47
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v47
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v48
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v48
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v48
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[57:60], v4, s[4:5] offset:96
+; GFX906-NEXT:    global_load_dwordx4 v[41:44], v4, s[4:5] offset:112
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v57
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v57
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v57
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v58
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v58
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v58
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v59
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v59
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v59
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v60
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v60
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v60
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v41
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v41
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v41
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v42
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v42
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v42
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v43
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v43
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v43
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v44
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v44
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v44
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[53:56], v4, s[4:5] offset:128
+; GFX906-NEXT:    global_load_dwordx4 v[25:28], v4, s[4:5] offset:144
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v53
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v53
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v53
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v54
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v54
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v54
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v55
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v55
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v55
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v56
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v56
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v56
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v25
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v25
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v25
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v26
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v26
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v26
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v27
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v27
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v27
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v28
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v28
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v28
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[49:52], v4, s[4:5] offset:160
+; GFX906-NEXT:    global_load_dwordx4 v[21:24], v4, s[4:5] offset:176
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v49
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v49
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v49
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v50
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v50
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v50
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v51
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v51
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v51
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v52
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v52
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v52
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v21
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v21
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v21
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v22
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v22
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v22
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v23
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v23
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v23
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v24
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v24
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v24
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[33:36], v4, s[4:5] offset:192
+; GFX906-NEXT:    global_load_dwordx4 v[17:20], v4, s[4:5] offset:208
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v33
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v33
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v33
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v34
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v34
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v34
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v35
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v35
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v35
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v36
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v36
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v36
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v17
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v17
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v17
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v18
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v18
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v18
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v19
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v19
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v19
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v20
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v20
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v20
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[37:40], v4, s[4:5] offset:224
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    global_load_dwordx4 v[9:12], v4, s[4:5] offset:240
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v37
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v37
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v37
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v38
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v38
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v38
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v39
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v39
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v39
+; GFX906-NEXT:    s_waitcnt vmcnt(8)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 8, v9
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v40
+; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 8, v11
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v40
+; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 16, v11
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v40
+; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 24, v11
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v9
+; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 8, v12
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v63, 24, v9
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v10
+; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 16, v12
+; GFX906-NEXT:    buffer_store_dword v63, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v63, 8, v10
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v10
+; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    buffer_store_dword v10, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v11, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v12, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 16, v5
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v12
+; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX906-NEXT:    s_cbranch_execz .LBB6_2
+; GFX906-NEXT:  ; %bb.1: ; %bb.1
+; GFX906-NEXT:    global_load_dwordx4 v[5:8], v4, s[6:7]
+; GFX906-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7] offset:16
+; GFX906-NEXT:    global_load_dwordx4 v[29:32], v4, s[6:7] offset:32
+; GFX906-NEXT:    global_load_dwordx4 v[13:16], v4, s[6:7] offset:48
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v5
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:780 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v5
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v6
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v6
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v6
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v7
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v7
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:776 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v8
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(13)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v0
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v0
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    buffer_store_dword v1, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v2, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 16, v5
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v3
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v29
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v29
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v29
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v30
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v30
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v30
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v31
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v31
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v31
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v32
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v32
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v32
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v13
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v13
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v13
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v14
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v14
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v14
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v15
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v15
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v15
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v16
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v16
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v16
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7] offset:64
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    global_load_dwordx4 v[45:48], v4, s[6:7] offset:80
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v0
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v0
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v45
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v45
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v45
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v46
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v46
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v46
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v47
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v47
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v47
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v48
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v48
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v48
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[57:60], v4, s[6:7] offset:96
+; GFX906-NEXT:    global_load_dwordx4 v[41:44], v4, s[6:7] offset:112
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v57
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v57
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v57
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v58
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v58
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v58
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v59
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v59
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v59
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v60
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v60
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v60
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v41
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v41
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v41
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v42
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v42
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v42
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v43
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v43
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v43
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v44
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v44
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v44
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[53:56], v4, s[6:7] offset:128
+; GFX906-NEXT:    global_load_dwordx4 v[25:28], v4, s[6:7] offset:144
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v53
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v53
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v53
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v54
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v54
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v54
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v55
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v55
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v55
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v56
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v56
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v56
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v25
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v25
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v25
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v26
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v26
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v26
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v27
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v27
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v27
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v28
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v28
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v28
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[49:52], v4, s[6:7] offset:160
+; GFX906-NEXT:    global_load_dwordx4 v[21:24], v4, s[6:7] offset:176
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v49
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v49
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v49
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v50
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v50
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v50
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v51
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v51
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v51
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v52
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v52
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v52
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v21
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v21
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v21
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v22
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v22
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v22
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v23
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v23
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v23
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v24
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v24
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v24
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[33:36], v4, s[6:7] offset:192
+; GFX906-NEXT:    global_load_dwordx4 v[17:20], v4, s[6:7] offset:208
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v33
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v33
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v33
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v34
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v34
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v34
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v35
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v35
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v35
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v36
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v36
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v36
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v17
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v17
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v17
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v18
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v18
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v18
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v19
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v19
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v19
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v20
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v20
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v20
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[37:40], v4, s[6:7] offset:224
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    global_load_dwordx4 v[9:12], v4, s[6:7] offset:240
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 8, v37
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v37
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v37
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 8, v38
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v38
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v38
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 8, v39
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v39
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v39
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 8, v40
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v40
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v40
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 8, v9
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v9
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v9
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v10
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 8, v11
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v11
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v11
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 8, v12
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v12
+; GFX906-NEXT:    v_lshrrev_b32_e32 v63, 8, v10
+; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v10
+; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    buffer_store_dword v10, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v11, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v12, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v12
+; GFX906-NEXT:  .LBB6_2: ; %bb.2
+; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:784 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v63, off, s[8:11], 0 offset:788 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:792 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    buffer_store_dword v18, off, s[8:11], 0 offset:796 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v19, off, s[8:11], 0 offset:800 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v20, off, s[8:11], 0 offset:804 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:780 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_mov_b32_e32 v4, 8
+; GFX906-NEXT:    v_mov_b32_e32 v63, 0xff
+; GFX906-NEXT:    v_mov_b32_e32 v18, v16
+; GFX906-NEXT:    v_mov_b32_e32 v17, v15
+; GFX906-NEXT:    v_mov_b32_e32 v16, v14
+; GFX906-NEXT:    v_mov_b32_e32 v15, v13
+; GFX906-NEXT:    v_mov_b32_e32 v19, v9
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v61, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:768 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_or_b32 v5, v5, v63, v61
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v61, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:772 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_or_b32 v6, v6, v63, v61
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v61, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:776 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_or_b32 v7, v7, v63, v61
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v61, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:692 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v11, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v12, off, s[8:11], 0 offset:20 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v13, off, s[8:11], 0 offset:24 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v14, off, s[8:11], 0 offset:28 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:712 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_or_b32 v8, v8, v63, v61
+; GFX906-NEXT:    v_and_b32_e32 v61, 0xff, v62
+; GFX906-NEXT:    v_lshlrev_b32_e32 v61, 16, v61
+; GFX906-NEXT:    s_waitcnt vmcnt(5)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v62, 24, v10
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:716 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX906-NEXT:    v_or3_b32 v5, v5, v61, v62
+; GFX906-NEXT:    v_mov_b32_e32 v61, 0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX906-NEXT:    v_or3_b32 v6, v6, v9, v10
+; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:736 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:740 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX906-NEXT:    v_or3_b32 v7, v7, v9, v10
+; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:744 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:748 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX906-NEXT:    v_or3_b32 v8, v8, v9, v10
+; GFX906-NEXT:    global_store_dwordx4 v61, v[5:8], s[2:3]
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:696 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:700 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:564 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:568 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:704 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:708 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(5)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v5, v11, v63, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v6, v12, v63, v6
+; GFX906-NEXT:    buffer_load_dword v11, off, s[8:11], 0 offset:572 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v12, off, s[8:11], 0 offset:576 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(5)
+; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX906-NEXT:    v_or3_b32 v5, v5, v9, v10
+; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:588 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:592 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(5)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v7, v13, v63, v7
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX906-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
+; GFX906-NEXT:    v_or3_b32 v6, v6, v11, v12
+; GFX906-NEXT:    buffer_load_dword v11, off, s[8:11], 0 offset:668 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v8, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX906-NEXT:    v_or3_b32 v7, v7, v9, v10
+; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:596 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:600 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v12, off, s[8:11], 0 offset:672 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_or_b32 v8, v14, v63, v8
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX906-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX906-NEXT:    v_or3_b32 v8, v8, v9, v10
+; GFX906-NEXT:    global_store_dwordx4 v61, v[5:8], s[2:3] offset:16
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:752 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
+; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:760 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:636 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:640 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:756 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:764 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(5)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v5, v29, v63, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX906-NEXT:    v_or3_b32 v5, v5, v9, v10
+; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:676 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:680 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_or_b32 v7, v31, v63, v7
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v6, v30, v63, v6
+; GFX906-NEXT:    v_or3_b32 v6, v6, v11, v12
+; GFX906-NEXT:    buffer_load_dword v11, off, s[8:11], 0 offset:524 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX906-NEXT:    v_or3_b32 v7, v7, v9, v10
+; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:684 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:688 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v8, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v8, v32, v63, v8
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX906-NEXT:    v_or3_b32 v8, v8, v9, v10
+; GFX906-NEXT:    global_store_dwordx4 v61, v[5:8], s[2:3] offset:32
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:652 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:660 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:500 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:504 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v12, off, s[8:11], 0 offset:528 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:656 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:664 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX906-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX906-NEXT:    s_waitcnt vmcnt(6)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v5, v15, v63, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(5)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX906-NEXT:    v_or3_b32 v5, v5, v9, v10
+; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:548 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:552 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_or_b32 v7, v17, v63, v7
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v6, v16, v63, v6
+; GFX906-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
+; GFX906-NEXT:    v_or3_b32 v6, v6, v11, v12
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX906-NEXT:    v_or3_b32 v7, v7, v9, v10
+; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:556 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:560 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v8, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v8, v18, v63, v8
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX906-NEXT:    v_or3_b32 v8, v8, v9, v10
+; GFX906-NEXT:    global_store_dwordx4 v61, v[5:8], s[2:3] offset:48
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:720 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v0, v0, v63, v5
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:724 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:584 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:604 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:608 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v1, v1, v63, v5
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:728 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX906-NEXT:    v_or3_b32 v1, v1, v7, v8
+; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:476 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v2, v2, v63, v5
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:732 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v3, v3, v63, v5
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:580 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:628 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:632 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v2, v2, v5, v6
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:644 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:648 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
+; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:64
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:612 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:620 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:468 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:472 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:480 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:616 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:624 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX906-NEXT:    s_waitcnt vmcnt(6)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v0, v45, v63, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(5)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:508 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:512 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_or_b32 v2, v47, v63, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v1, v46, v63, v1
+; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX906-NEXT:    v_or3_b32 v1, v1, v7, v8
+; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:428 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v2, v2, v5, v6
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:516 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:520 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v3, v48, v63, v3
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
+; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:80
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:532 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:540 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:404 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:408 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:432 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:536 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:544 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX906-NEXT:    s_waitcnt vmcnt(6)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v0, v57, v63, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(5)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:452 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:456 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_or_b32 v2, v59, v63, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v1, v58, v63, v1
+; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX906-NEXT:    v_or3_b32 v1, v1, v7, v8
+; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:332 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v2, v2, v5, v6
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:460 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:464 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v3, v60, v63, v3
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
+; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:96
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:436 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:440 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:308 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:312 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:336 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:444 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:448 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX906-NEXT:    s_waitcnt vmcnt(6)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v0, v41, v63, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(5)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:356 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:360 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_or_b32 v1, v42, v63, v1
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX906-NEXT:    v_or3_b32 v1, v1, v7, v8
+; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:380 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v2, v43, v63, v2
+; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:384 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v3, v44, v63, v3
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v2, v2, v5, v6
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:364 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:368 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
+; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:112
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:484 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:492 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:372 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:376 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:488 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:496 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(5)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v0, v53, v63, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:412 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:416 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_or_b32 v2, v55, v63, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v1, v54, v63, v1
+; GFX906-NEXT:    v_or3_b32 v1, v1, v7, v8
+; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:284 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v2, v2, v5, v6
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:420 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:424 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v3, v56, v63, v3
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
+; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:128
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:396 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:276 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:280 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:288 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:392 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:400 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX906-NEXT:    s_waitcnt vmcnt(6)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v0, v25, v63, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(5)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:316 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:320 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_or_b32 v2, v27, v63, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v1, v26, v63, v1
+; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX906-NEXT:    v_or3_b32 v1, v1, v7, v8
+; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:236 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v2, v2, v5, v6
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:324 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:328 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v3, v28, v63, v3
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
+; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:144
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:340 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:348 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:164 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:168 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:240 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:344 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:352 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(6)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v0, v49, v63, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(5)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:260 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:264 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_or_b32 v2, v51, v63, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v1, v50, v63, v1
+; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v2, v2, v5, v6
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:268 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:272 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v3, v52, v63, v3
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:80 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX906-NEXT:    v_or3_b32 v1, v1, v7, v8
+; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:40 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:32 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:36 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:160
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:244 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:248 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:252 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:256 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v0, v21, v63, v0
+; GFX906-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:44 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:48 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v1, v22, v63, v1
+; GFX906-NEXT:    v_or3_b32 v1, v1, v7, v8
+; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:108 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v2, v23, v63, v2
+; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:112 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v3, v24, v63, v3
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v2, v2, v5, v6
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:52 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:56 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
+; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:176
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:292 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:300 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:60 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:64 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:296 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:304 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(5)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v0, v33, v63, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:172 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:176 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_or_b32 v2, v35, v63, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v1, v34, v63, v1
+; GFX906-NEXT:    v_or3_b32 v1, v1, v7, v8
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v2, v2, v5, v6
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:228 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:232 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v3, v36, v63, v3
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
+; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:192
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:116 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:792 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:796 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:800 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:804 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:120 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:124 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:136 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(7)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_and_or_b32 v0, v5, v63, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:128 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:132 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:148 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX906-NEXT:    v_and_or_b32 v1, v6, v63, v1
+; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX906-NEXT:    v_or3_b32 v1, v1, v2, v3
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:140 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:144 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:160 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX906-NEXT:    v_and_or_b32 v2, v7, v63, v2
+; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX906-NEXT:    v_or3_b32 v2, v2, v3, v5
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:152 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:156 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_and_or_b32 v3, v8, v63, v3
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
+; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:208
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:180 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:184 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:188 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:200 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:224 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:212 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(5)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v0, v37, v63, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:192 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:196 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX906-NEXT:    v_and_or_b32 v1, v38, v63, v1
+; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX906-NEXT:    v_or3_b32 v1, v1, v2, v3
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:204 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:208 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX906-NEXT:    v_and_or_b32 v2, v39, v63, v2
+; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX906-NEXT:    v_or3_b32 v2, v2, v3, v5
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:216 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:220 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_and_or_b32 v3, v40, v63, v3
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
+; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:224
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:68 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:88 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:784 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:72 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(7)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_and_or_b32 v0, v5, v63, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:788 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:92 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:76 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v1, v6, v63, v1
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX906-NEXT:    v_or3_b32 v1, v1, v2, v3
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:84 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:96 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX906-NEXT:    v_and_or_b32 v2, v7, v63, v2
+; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX906-NEXT:    v_or3_b32 v2, v2, v3, v5
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:104 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 24, v19
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    buffer_load_dword v4, off, s[8:11], 0 offset:100 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_and_or_b32 v3, v8, v63, v3
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX906-NEXT:    v_or3_b32 v3, v3, v4, v5
+; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:240
+; GFX906-NEXT:    s_endpgm
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <256 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <256 x i8>, ptr addrspace(1) %gep2
+  %cmp = icmp ult i32 %idx, 15
+  br i1 %cmp, label %bb.1, label %bb.2
+bb.1:
+  br label %bb.2
+
+bb.2:
+  %tmp5 = phi <256 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+  store <256 x i8> %tmp5, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-loop-carried.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-loop-carried.ll
new file mode 100644
index 0000000000000..95c541e2e60b7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-loop-carried.ll
@@ -0,0 +1,67 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
+
+define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: v32i8_liveout:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 5, v0
+; GFX906-NEXT:    v_cmp_le_u32_e32 vcc, 15, v0
+; GFX906-NEXT:    v_mov_b32_e32 v3, 0xff
+; GFX906-NEXT:    v_mov_b32_e32 v4, 8
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    global_load_dword v1, v1, s[2:3]
+; GFX906-NEXT:    s_mov_b64 s[2:3], 0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v2, 24, v1
+; GFX906-NEXT:    v_mov_b32_e32 v5, v0
+; GFX906-NEXT:  .LBB0_1: ; %bb.1
+; GFX906-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX906-NEXT:    v_and_or_b32 v6, v1, v3, v6
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX906-NEXT:    s_and_b64 s[4:5], exec, vcc
+; GFX906-NEXT:    v_or3_b32 v5, v6, v5, v2
+; GFX906-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
+; GFX906-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
+; GFX906-NEXT:    v_mov_b32_e32 v6, v0
+; GFX906-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX906-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX906-NEXT:  ; %bb.2: ; %bb.2.loopexit
+; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX906-NEXT:    v_mov_b32_e32 v4, 8
+; GFX906-NEXT:    v_mov_b32_e32 v3, 0xff
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v0, v1, v3, v0
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v5
+; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX906-NEXT:    v_mov_b32_e32 v1, 0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX906-NEXT:    s_endpgm
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  br label %bb.1
+
+bb.1:
+  %temp = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+  %vec2 = shufflevector <4 x i8> %vec1, <4 x i8> %temp, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %cmp = icmp ult i32 %idx, 15
+  br i1 %cmp, label %bb.1, label %bb.2
+  br label %bb.2
+
+bb.2:
+  store <4 x i8> %vec2, ptr addrspace(1) %dst, align 4
+  ret void
+}

>From 95bd7877c4fa0cac3cb1407ab7329227f1511293 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 19 Apr 2024 10:58:25 -0700
Subject: [PATCH 09/17] Enable for GlobalISel

Change-Id: I83ae012da3118b0a40fb8a80be5029ce5bd2d78a
---
 .../AMDGPU/AMDGPULateCodeGenPrepare.cpp       |    4 -
 .../AMDGPU/GlobalISel/vni8-across-blocks.ll   | 2067 ++---------------
 .../AMDGPU/GlobalISel/vni8-loop-carried.ll    |   37 +-
 3 files changed, 198 insertions(+), 1910 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index d0e0977d7bb4e..822b85fac5188 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -194,10 +194,6 @@ bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
   for (auto &BB : F)
     for (Instruction &I : llvm::make_early_inc_range(BB)) {
       Changed |= visit(I);
-      // GlobalISel should directly use the values, and do not need to emit
-      // CopyTo/CopyFrom Regs across blocks
-      if (TM.Options.EnableGlobalISel)
-        continue;
       if (!LRO.shouldReplaceUses(I))
         continue;
       Changed |= LRO.replaceUses(I);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
index cbb8fede31efa..3def10e73717b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
@@ -6,28 +6,36 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
-; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX906-NEXT:    v_mov_b32_e32 v3, 8
+; GFX906-NEXT:    v_mov_b32_e32 v5, 16
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dword v1, v4, s[4:5]
+; GFX906-NEXT:    global_load_dword v4, v2, s[4:5]
+; GFX906-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX906-NEXT:    v_and_b32_e32 v6, 0xff, v4
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v7, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX906-NEXT:    v_or3_b32 v4, v6, v7, v4
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB0_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dword v1, v4, s[6:7]
+; GFX906-NEXT:    global_load_dword v0, v2, s[6:7]
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v0
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX906-NEXT:    v_or3_b32 v4, v2, v3, v0
 ; GFX906-NEXT:  .LBB0_2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    v_and_b32_e32 v0, 0xff, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v4
+; GFX906-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_mov_b32 s0, 0xffff
+; GFX906-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX906-NEXT:    v_and_b32_sdwa v1, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX906-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX906-NEXT:    global_store_short v1, v0, s[2:3]
@@ -55,34 +63,19 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 2, v0
+; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dword v1, v5, s[4:5]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
+; GFX906-NEXT:    global_load_dword v1, v2, s[4:5]
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dword v1, v5, s[6:7]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
+; GFX906-NEXT:    global_load_dword v1, v2, s[6:7]
 ; GFX906-NEXT:  .LBB1_2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    v_mov_b32_e32 v5, 8
-; GFX906-NEXT:    v_mov_b32_e32 v0, 0xff
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v0, v1, v0, v2
-; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v3
-; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
-; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v2
-; GFX906-NEXT:    v_mov_b32_e32 v1, 0
-; GFX906-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX906-NEXT:    v_mov_b32_e32 v0, 0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -106,30 +99,28 @@ define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
+; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v6, s[4:5]
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[4:5]
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v6, s[6:7]
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[6:7]
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX906-NEXT:  .LBB2_2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    v_mov_b32_e32 v0, 0
-; GFX906-NEXT:    global_store_byte v0, v1, s[2:3]
-; GFX906-NEXT:    global_store_byte v0, v3, s[2:3] offset:1
-; GFX906-NEXT:    global_store_byte v0, v4, s[2:3] offset:2
-; GFX906-NEXT:    global_store_byte v0, v5, s[2:3] offset:3
-; GFX906-NEXT:    global_store_byte v0, v2, s[2:3] offset:4
+; GFX906-NEXT:    v_mov_b32_e32 v4, 0
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
+; GFX906-NEXT:    global_store_byte v4, v1, s[2:3]
+; GFX906-NEXT:    global_store_byte v4, v0, s[2:3] offset:1
+; GFX906-NEXT:    global_store_byte_d16_hi v4, v1, s[2:3] offset:2
+; GFX906-NEXT:    global_store_byte v4, v3, s[2:3] offset:3
+; GFX906-NEXT:    global_store_byte v4, v2, s[2:3] offset:4
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -153,46 +144,19 @@ define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 3, v0
+; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v9, s[4:5]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[4:5]
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v9, s[6:7]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[6:7]
 ; GFX906-NEXT:  .LBB3_2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    v_mov_b32_e32 v10, 8
-; GFX906-NEXT:    v_mov_b32_e32 v9, 0xff
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v0, v1, v9, v0
-; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v4
-; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 24, v5
-; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v3
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v10, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v1, v2, v9, v1
-; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v7
-; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 24, v8
-; GFX906-NEXT:    v_or3_b32 v1, v1, v2, v3
-; GFX906-NEXT:    v_mov_b32_e32 v2, 0
-; GFX906-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX906-NEXT:    v_mov_b32_e32 v0, 0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[2:3]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -216,70 +180,19 @@ define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v17, 4, v0
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 4, v0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx4 v[1:4], v17, s[4:5]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 8, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 16, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 24, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 8, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 16, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 24, v4
+; GFX906-NEXT:    global_load_dwordx4 v[1:4], v5, s[4:5]
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx4 v[1:4], v17, s[6:7]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 8, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 16, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 24, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 8, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 16, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 24, v4
+; GFX906-NEXT:    global_load_dwordx4 v[1:4], v5, s[6:7]
 ; GFX906-NEXT:  .LBB4_2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    v_mov_b32_e32 v18, 8
-; GFX906-NEXT:    v_mov_b32_e32 v17, 0xff
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v18, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v0, v1, v17, v0
-; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v6
-; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 24, v7
-; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v5
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v18, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v1, v2, v17, v1
-; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v9
-; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 24, v10
-; GFX906-NEXT:    v_or3_b32 v1, v1, v2, v5
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v2, v3, v17, v2
-; GFX906-NEXT:    v_and_b32_e32 v3, 0xff, v12
-; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 24, v13
-; GFX906-NEXT:    v_or3_b32 v2, v2, v3, v5
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v18, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v3, v4, v17, v3
-; GFX906-NEXT:    v_and_b32_e32 v4, 0xff, v15
-; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 24, v16
-; GFX906-NEXT:    v_or3_b32 v3, v3, v4, v5
-; GFX906-NEXT:    v_mov_b32_e32 v4, 0
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX906-NEXT:    v_mov_b32_e32 v0, 0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    global_store_dwordx4 v0, v[1:4], s[2:3]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -302,124 +215,24 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
 ; GFX906-LABEL: v32i8_liveout:
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX906-NEXT:    v_lshlrev_b32_e32 v32, 5, v0
-; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 5, v0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx4 v[5:8], v32, s[4:5]
-; GFX906-NEXT:    global_load_dwordx4 v[1:4], v32, s[4:5] offset:16
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v26, 16, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v27, 24, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v28, 8, v6
-; GFX906-NEXT:    v_lshrrev_b32_e32 v29, 16, v6
-; GFX906-NEXT:    v_lshrrev_b32_e32 v30, 24, v6
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v7
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
-; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 24, v7
-; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 8, v8
-; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 16, v8
-; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 24, v8
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 8, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 24, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v18, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v19, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v20, 8, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v21, 16, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v22, 24, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v23, 8, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v24, 16, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v25, 24, v4
-; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX906-NEXT:    global_load_dwordx4 v[1:4], v9, s[4:5]
+; GFX906-NEXT:    global_load_dwordx4 v[5:8], v9, s[4:5] offset:16
+; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx4 v[5:8], v32, s[6:7]
-; GFX906-NEXT:    global_load_dwordx4 v[1:4], v32, s[6:7] offset:16
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v26, 16, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v27, 24, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v28, 8, v6
-; GFX906-NEXT:    v_lshrrev_b32_e32 v29, 16, v6
-; GFX906-NEXT:    v_lshrrev_b32_e32 v30, 24, v6
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v7
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
-; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 24, v7
-; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 8, v8
-; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 16, v8
-; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 24, v8
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 8, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 24, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v18, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v19, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v20, 8, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v21, 16, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v22, 24, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v23, 8, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v24, 16, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v25, 24, v4
+; GFX906-NEXT:    global_load_dwordx4 v[1:4], v9, s[6:7]
+; GFX906-NEXT:    global_load_dwordx4 v[5:8], v9, s[6:7] offset:16
 ; GFX906-NEXT:  .LBB5_2: ; %bb.2
-; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX906-NEXT:    v_mov_b32_e32 v32, 8
-; GFX906-NEXT:    v_mov_b32_e32 v33, 0xff
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v0, v7, v33, v0
-; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v9
-; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 24, v10
-; GFX906-NEXT:    v_or3_b32 v7, v0, v7, v9
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v32, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v0, v8, v33, v0
-; GFX906-NEXT:    v_and_b32_e32 v8, 0xff, v12
-; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 24, v13
-; GFX906-NEXT:    v_or3_b32 v8, v0, v8, v9
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v32, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v0, v1, v33, v0
-; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v15
-; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 24, v16
-; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v9
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v32, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v1, v2, v33, v1
-; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v18
-; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 24, v19
-; GFX906-NEXT:    v_or3_b32 v1, v1, v2, v9
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v32, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v2, v3, v33, v2
-; GFX906-NEXT:    v_and_b32_e32 v3, 0xff, v21
-; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 24, v22
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v28, v32, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_or3_b32 v2, v2, v3, v9
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v32, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_b32_e32 v26, 0xff, v26
-; GFX906-NEXT:    v_and_or_b32 v6, v6, v33, v28
-; GFX906-NEXT:    v_and_b32_e32 v28, 0xff, v29
-; GFX906-NEXT:    v_and_or_b32 v3, v4, v33, v3
-; GFX906-NEXT:    v_and_b32_e32 v4, 0xff, v24
-; GFX906-NEXT:    v_and_or_b32 v5, v5, v33, v31
-; GFX906-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; GFX906-NEXT:    v_lshlrev_b32_e32 v27, 24, v27
-; GFX906-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
-; GFX906-NEXT:    v_lshlrev_b32_e32 v29, 24, v30
-; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 24, v25
-; GFX906-NEXT:    v_or3_b32 v5, v5, v26, v27
-; GFX906-NEXT:    v_or3_b32 v6, v6, v28, v29
-; GFX906-NEXT:    v_or3_b32 v3, v3, v4, v9
-; GFX906-NEXT:    v_mov_b32_e32 v4, 0
-; GFX906-NEXT:    global_store_dwordx4 v4, v[5:8], s[0:1]
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX906-NEXT:    v_mov_b32_e32 v0, 0
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    global_store_dwordx4 v0, v[1:4], s[2:3]
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    global_store_dwordx4 v0, v[5:8], s[2:3] offset:16
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -450,1654 +263,148 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX906-NEXT:    s_addc_u32 s9, s9, 0
+; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    global_load_dwordx4 v[5:8], v4, s[4:5]
-; GFX906-NEXT:    global_load_dwordx4 v[9:12], v4, s[4:5] offset:16
-; GFX906-NEXT:    global_load_dwordx4 v[29:32], v4, s[4:5] offset:32
-; GFX906-NEXT:    global_load_dwordx4 v[13:16], v4, s[4:5] offset:48
-; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v5
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:780 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v5
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v6
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v6
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v7
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v7
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v7
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v8
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:776 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v8
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v8
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(13)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v9
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v9
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v9
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v10
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v10
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v10
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v11
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v11
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v11
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v12
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v12
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    buffer_store_dword v10, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v11, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v12, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v12
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v29
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v29
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v29
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v30
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v30
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v30
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v31
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v31
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v31
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v32
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v32
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v32
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v13
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v13
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v13
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v14
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v14
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v14
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v15
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v15
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v15
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v16
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v16
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v16
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[0:3], v4, s[4:5] offset:64
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    global_load_dwordx4 v[45:48], v4, s[4:5] offset:80
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v0
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v0
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v45
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v45
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v45
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v46
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v46
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v46
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v47
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v47
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v47
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v48
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v48
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v48
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[57:60], v4, s[4:5] offset:96
-; GFX906-NEXT:    global_load_dwordx4 v[41:44], v4, s[4:5] offset:112
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v57
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v57
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v57
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v58
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v58
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v58
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v59
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v59
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v59
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v60
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v60
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v60
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v41
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v41
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v41
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v42
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v42
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v42
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v43
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v43
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v43
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v44
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v44
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v44
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[53:56], v4, s[4:5] offset:128
-; GFX906-NEXT:    global_load_dwordx4 v[25:28], v4, s[4:5] offset:144
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v53
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v53
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v53
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v54
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v54
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v54
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v55
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v55
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v55
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v56
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v56
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v56
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v25
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v25
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v25
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v26
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v26
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v26
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v27
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v27
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v27
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v28
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v28
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v28
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[49:52], v4, s[4:5] offset:160
-; GFX906-NEXT:    global_load_dwordx4 v[21:24], v4, s[4:5] offset:176
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v49
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v49
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v49
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v50
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v50
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v50
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v51
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v51
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v51
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v52
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v52
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v52
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v21
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v21
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v21
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v22
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v22
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v22
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v23
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v23
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v23
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v24
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v24
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v24
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[33:36], v4, s[4:5] offset:192
-; GFX906-NEXT:    global_load_dwordx4 v[17:20], v4, s[4:5] offset:208
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v33
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v33
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v33
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v34
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v34
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v34
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v35
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v35
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v35
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v36
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v36
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v36
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v17
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v17
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v17
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v18
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v18
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v18
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v19
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v19
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v19
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v20
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v20
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v20
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[37:40], v4, s[4:5] offset:224
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    global_load_dwordx4 v[9:12], v4, s[4:5] offset:240
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v37
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v37
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v37
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v38
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v38
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v38
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v39
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v39
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v39
-; GFX906-NEXT:    s_waitcnt vmcnt(8)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 8, v9
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v40
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 8, v11
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v40
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 16, v11
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v40
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 24, v11
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v9
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 8, v12
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v63, 24, v9
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v10
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 16, v12
-; GFX906-NEXT:    buffer_store_dword v63, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v63, 8, v10
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v10
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v5, off, s[8:11], 0 ; 4-byte Folded Spill
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    buffer_store_dword v10, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v11, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v12, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 16, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v12
+; GFX906-NEXT:    buffer_store_dword v6, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v7, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v8, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[5:8], v4, s[4:5] offset:16
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    global_load_dwordx4 v[9:12], v4, s[4:5] offset:32
+; GFX906-NEXT:    global_load_dwordx4 v[13:16], v4, s[4:5] offset:48
+; GFX906-NEXT:    global_load_dwordx4 v[17:20], v4, s[4:5] offset:64
+; GFX906-NEXT:    global_load_dwordx4 v[21:24], v4, s[4:5] offset:80
+; GFX906-NEXT:    global_load_dwordx4 v[25:28], v4, s[4:5] offset:96
+; GFX906-NEXT:    global_load_dwordx4 v[29:32], v4, s[4:5] offset:112
+; GFX906-NEXT:    global_load_dwordx4 v[33:36], v4, s[4:5] offset:128
+; GFX906-NEXT:    global_load_dwordx4 v[37:40], v4, s[4:5] offset:144
+; GFX906-NEXT:    global_load_dwordx4 v[41:44], v4, s[4:5] offset:160
+; GFX906-NEXT:    global_load_dwordx4 v[45:48], v4, s[4:5] offset:176
+; GFX906-NEXT:    global_load_dwordx4 v[49:52], v4, s[4:5] offset:192
+; GFX906-NEXT:    global_load_dwordx4 v[53:56], v4, s[4:5] offset:208
+; GFX906-NEXT:    global_load_dwordx4 v[57:60], v4, s[4:5] offset:224
+; GFX906-NEXT:    global_load_dwordx4 v[0:3], v4, s[4:5] offset:240
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx4 v[5:8], v4, s[6:7]
-; GFX906-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7] offset:16
-; GFX906-NEXT:    global_load_dwordx4 v[29:32], v4, s[6:7] offset:32
-; GFX906-NEXT:    global_load_dwordx4 v[13:16], v4, s[6:7] offset:48
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v5
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:780 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v5
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v6
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v6
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v6
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v7
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v7
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:776 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v8
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(13)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v0
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v0
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7]
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    buffer_store_dword v1, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v2, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 16, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v3
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v29
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v29
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v29
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v30
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v30
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v30
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v31
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v31
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v31
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v32
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v32
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v32
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v13
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v13
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v13
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v14
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v14
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v14
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v15
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v15
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v15
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v16
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v16
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v16
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7] offset:64
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    global_load_dwordx4 v[45:48], v4, s[6:7] offset:80
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v0
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v0
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v45
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v45
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v45
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v46
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v46
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v46
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v47
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v47
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v47
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v48
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v48
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v48
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[57:60], v4, s[6:7] offset:96
-; GFX906-NEXT:    global_load_dwordx4 v[41:44], v4, s[6:7] offset:112
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v57
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v57
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v57
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v58
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v58
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v58
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v59
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v59
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v59
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v60
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v60
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v60
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v41
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v41
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v41
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v42
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v42
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v42
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v43
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v43
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v43
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v44
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v44
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v44
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[53:56], v4, s[6:7] offset:128
-; GFX906-NEXT:    global_load_dwordx4 v[25:28], v4, s[6:7] offset:144
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v53
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v53
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v53
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v54
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v54
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v54
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v55
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v55
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v55
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v56
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v56
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v56
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v25
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v25
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v25
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v26
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v26
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v26
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v27
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v27
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v27
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v28
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v28
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v28
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[49:52], v4, s[6:7] offset:160
-; GFX906-NEXT:    global_load_dwordx4 v[21:24], v4, s[6:7] offset:176
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v49
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v49
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v49
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v50
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v50
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v50
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v51
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v51
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v51
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v52
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v52
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v52
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v21
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v21
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v21
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v22
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v22
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v22
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v23
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v23
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v23
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v24
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v24
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v24
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[33:36], v4, s[6:7] offset:192
-; GFX906-NEXT:    global_load_dwordx4 v[17:20], v4, s[6:7] offset:208
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v33
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v33
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v33
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v34
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v34
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v34
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v35
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v35
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v35
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v36
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v36
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v36
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v17
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v17
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v17
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v18
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v18
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v18
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v19
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v19
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v19
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v20
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v20
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v20
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[37:40], v4, s[6:7] offset:224
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    global_load_dwordx4 v[9:12], v4, s[6:7] offset:240
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 8, v37
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v37
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v37
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 8, v38
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v38
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v38
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 8, v39
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v39
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v39
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 8, v40
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v40
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v40
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 8, v9
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v9
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v9
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v10
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 8, v11
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v11
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v11
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 8, v12
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v12
-; GFX906-NEXT:    v_lshrrev_b32_e32 v63, 8, v10
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v10
-; GFX906-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v9, off, s[8:11], 0 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 ; 4-byte Folded Spill
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    buffer_store_dword v10, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v11, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v12, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v12
+; GFX906-NEXT:    buffer_store_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[5:8], v4, s[6:7] offset:16
+; GFX906-NEXT:    global_load_dwordx4 v[9:12], v4, s[6:7] offset:32
+; GFX906-NEXT:    global_load_dwordx4 v[13:16], v4, s[6:7] offset:48
+; GFX906-NEXT:    global_load_dwordx4 v[17:20], v4, s[6:7] offset:64
+; GFX906-NEXT:    global_load_dwordx4 v[21:24], v4, s[6:7] offset:80
+; GFX906-NEXT:    global_load_dwordx4 v[25:28], v4, s[6:7] offset:96
+; GFX906-NEXT:    global_load_dwordx4 v[29:32], v4, s[6:7] offset:112
+; GFX906-NEXT:    global_load_dwordx4 v[33:36], v4, s[6:7] offset:128
+; GFX906-NEXT:    global_load_dwordx4 v[37:40], v4, s[6:7] offset:144
+; GFX906-NEXT:    global_load_dwordx4 v[41:44], v4, s[6:7] offset:160
+; GFX906-NEXT:    global_load_dwordx4 v[45:48], v4, s[6:7] offset:176
+; GFX906-NEXT:    global_load_dwordx4 v[49:52], v4, s[6:7] offset:192
+; GFX906-NEXT:    global_load_dwordx4 v[53:56], v4, s[6:7] offset:208
+; GFX906-NEXT:    global_load_dwordx4 v[57:60], v4, s[6:7] offset:224
+; GFX906-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7] offset:240
 ; GFX906-NEXT:  .LBB6_2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:784 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v63, off, s[8:11], 0 offset:788 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:792 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    buffer_store_dword v18, off, s[8:11], 0 offset:796 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v19, off, s[8:11], 0 offset:800 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v20, off, s[8:11], 0 offset:804 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:780 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_mov_b32_e32 v4, 8
-; GFX906-NEXT:    v_mov_b32_e32 v63, 0xff
-; GFX906-NEXT:    v_mov_b32_e32 v18, v16
-; GFX906-NEXT:    v_mov_b32_e32 v17, v15
-; GFX906-NEXT:    v_mov_b32_e32 v16, v14
-; GFX906-NEXT:    v_mov_b32_e32 v15, v13
-; GFX906-NEXT:    v_mov_b32_e32 v19, v9
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v61, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:768 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_or_b32 v5, v5, v63, v61
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v61, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:772 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_or_b32 v6, v6, v63, v61
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v61, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:776 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_or_b32 v7, v7, v63, v61
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v61, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:692 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v11, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v12, off, s[8:11], 0 offset:20 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v13, off, s[8:11], 0 offset:24 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v14, off, s[8:11], 0 offset:28 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:712 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_or_b32 v8, v8, v63, v61
-; GFX906-NEXT:    v_and_b32_e32 v61, 0xff, v62
-; GFX906-NEXT:    v_lshlrev_b32_e32 v61, 16, v61
-; GFX906-NEXT:    s_waitcnt vmcnt(5)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v62, 24, v10
-; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:716 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX906-NEXT:    v_or3_b32 v5, v5, v61, v62
-; GFX906-NEXT:    v_mov_b32_e32 v61, 0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX906-NEXT:    v_or3_b32 v6, v6, v9, v10
-; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:736 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:740 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX906-NEXT:    v_or3_b32 v7, v7, v9, v10
-; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:744 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:748 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX906-NEXT:    v_or3_b32 v8, v8, v9, v10
-; GFX906-NEXT:    global_store_dwordx4 v61, v[5:8], s[2:3]
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:696 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:700 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:564 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:568 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:704 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:708 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(5)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v5, v11, v63, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v6, v12, v63, v6
-; GFX906-NEXT:    buffer_load_dword v11, off, s[8:11], 0 offset:572 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v12, off, s[8:11], 0 offset:576 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(5)
-; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX906-NEXT:    v_or3_b32 v5, v5, v9, v10
-; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:588 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:592 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(5)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v7, v13, v63, v7
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX906-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
-; GFX906-NEXT:    v_or3_b32 v6, v6, v11, v12
-; GFX906-NEXT:    buffer_load_dword v11, off, s[8:11], 0 offset:668 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v8, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX906-NEXT:    v_or3_b32 v7, v7, v9, v10
-; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:596 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:600 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v12, off, s[8:11], 0 offset:672 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_or_b32 v8, v14, v63, v8
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX906-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX906-NEXT:    v_or3_b32 v8, v8, v9, v10
-; GFX906-NEXT:    global_store_dwordx4 v61, v[5:8], s[2:3] offset:16
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:752 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
-; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:760 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:636 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:640 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:756 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:764 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(5)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v5, v29, v63, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX906-NEXT:    v_or3_b32 v5, v5, v9, v10
-; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:676 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:680 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_or_b32 v7, v31, v63, v7
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v6, v30, v63, v6
-; GFX906-NEXT:    v_or3_b32 v6, v6, v11, v12
-; GFX906-NEXT:    buffer_load_dword v11, off, s[8:11], 0 offset:524 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX906-NEXT:    v_or3_b32 v7, v7, v9, v10
-; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:684 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:688 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v8, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v8, v32, v63, v8
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX906-NEXT:    v_or3_b32 v8, v8, v9, v10
-; GFX906-NEXT:    global_store_dwordx4 v61, v[5:8], s[2:3] offset:32
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:652 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:660 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:500 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:504 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v12, off, s[8:11], 0 offset:528 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:656 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:664 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX906-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX906-NEXT:    s_waitcnt vmcnt(6)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v5, v15, v63, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(5)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX906-NEXT:    v_or3_b32 v5, v5, v9, v10
-; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:548 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:552 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_or_b32 v7, v17, v63, v7
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v6, v16, v63, v6
-; GFX906-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
-; GFX906-NEXT:    v_or3_b32 v6, v6, v11, v12
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX906-NEXT:    v_or3_b32 v7, v7, v9, v10
-; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:556 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:560 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v8, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v8, v18, v63, v8
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX906-NEXT:    v_or3_b32 v8, v8, v9, v10
-; GFX906-NEXT:    global_store_dwordx4 v61, v[5:8], s[2:3] offset:48
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:720 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v0, v0, v63, v5
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:724 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:584 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:604 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:608 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v1, v1, v63, v5
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:728 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX906-NEXT:    v_or3_b32 v1, v1, v7, v8
-; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:476 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v2, v2, v63, v5
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:732 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v3, v3, v63, v5
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:580 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    v_or3_b32 v0, v0, v5, v6
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:628 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:632 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v2, v2, v5, v6
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:644 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:648 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
-; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:64
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:612 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:620 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:468 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:472 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:480 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:616 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:624 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX906-NEXT:    s_waitcnt vmcnt(6)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v0, v45, v63, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(5)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v0, v0, v5, v6
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:508 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:512 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_or_b32 v2, v47, v63, v2
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v1, v46, v63, v1
-; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX906-NEXT:    v_or3_b32 v1, v1, v7, v8
-; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:428 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v2, v2, v5, v6
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:516 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:520 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v3, v48, v63, v3
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
-; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:80
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:532 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:540 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:404 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:408 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:432 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:536 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:544 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX906-NEXT:    s_waitcnt vmcnt(6)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v0, v57, v63, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(5)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v0, v0, v5, v6
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:452 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:456 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_or_b32 v2, v59, v63, v2
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v1, v58, v63, v1
-; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX906-NEXT:    v_or3_b32 v1, v1, v7, v8
-; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:332 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v2, v2, v5, v6
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:460 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:464 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v3, v60, v63, v3
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
-; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:96
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:436 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:440 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:308 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:312 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:336 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:444 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:448 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX906-NEXT:    s_waitcnt vmcnt(6)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v0, v41, v63, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(5)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v0, v0, v5, v6
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:356 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:360 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_or_b32 v1, v42, v63, v1
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX906-NEXT:    v_or3_b32 v1, v1, v7, v8
-; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:380 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v2, v43, v63, v2
-; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:384 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v3, v44, v63, v3
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v2, v2, v5, v6
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:364 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:368 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
-; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:112
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:484 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:492 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:372 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:376 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:488 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:496 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(5)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v0, v53, v63, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v0, v0, v5, v6
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:412 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:416 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_or_b32 v2, v55, v63, v2
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v1, v54, v63, v1
-; GFX906-NEXT:    v_or3_b32 v1, v1, v7, v8
-; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:284 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v2, v2, v5, v6
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:420 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:424 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v3, v56, v63, v3
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
-; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:128
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:396 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:276 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:280 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:288 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:392 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:400 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX906-NEXT:    s_waitcnt vmcnt(6)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v0, v25, v63, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(5)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v0, v0, v5, v6
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:316 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:320 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_or_b32 v2, v27, v63, v2
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v1, v26, v63, v1
-; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX906-NEXT:    v_or3_b32 v1, v1, v7, v8
-; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:236 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v2, v2, v5, v6
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:324 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:328 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v3, v28, v63, v3
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
-; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:144
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:340 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:348 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:164 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:168 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:240 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:344 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:352 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(6)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v0, v49, v63, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(5)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v0, v0, v5, v6
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:260 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:264 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_or_b32 v2, v51, v63, v2
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v1, v50, v63, v1
-; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v2, v2, v5, v6
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:268 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:272 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v3, v52, v63, v3
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:80 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX906-NEXT:    v_or3_b32 v1, v1, v7, v8
-; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:40 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:32 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:36 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:160
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:244 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:248 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:252 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:256 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v0, v21, v63, v0
-; GFX906-NEXT:    v_or3_b32 v0, v0, v5, v6
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:44 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:48 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v1, v22, v63, v1
-; GFX906-NEXT:    v_or3_b32 v1, v1, v7, v8
-; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:108 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v2, v23, v63, v2
-; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:112 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v3, v24, v63, v3
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v2, v2, v5, v6
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:52 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:56 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
-; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:176
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:292 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:300 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:60 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:64 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:296 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:304 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(5)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v0, v33, v63, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v0, v0, v5, v6
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:172 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:176 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_or_b32 v2, v35, v63, v2
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v1, v34, v63, v1
-; GFX906-NEXT:    v_or3_b32 v1, v1, v7, v8
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v2, v2, v5, v6
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:228 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:232 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v3, v36, v63, v3
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
-; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:192
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:116 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:792 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:796 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:800 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:804 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:120 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:124 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:136 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(7)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_and_or_b32 v0, v5, v63, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v2
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:128 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:132 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:148 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX906-NEXT:    v_and_or_b32 v1, v6, v63, v1
-; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX906-NEXT:    v_or3_b32 v1, v1, v2, v3
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:140 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:144 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:160 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX906-NEXT:    v_and_or_b32 v2, v7, v63, v2
-; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX906-NEXT:    v_or3_b32 v2, v2, v3, v5
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:152 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:156 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_and_or_b32 v3, v8, v63, v3
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
-; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:208
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:180 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:184 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:188 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:200 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:224 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:212 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(5)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v0, v37, v63, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v2
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:192 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:196 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(4)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX906-NEXT:    v_and_or_b32 v1, v38, v63, v1
-; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX906-NEXT:    v_or3_b32 v1, v1, v2, v3
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:204 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:208 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX906-NEXT:    v_and_or_b32 v2, v39, v63, v2
-; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX906-NEXT:    v_or3_b32 v2, v2, v3, v5
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:216 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:220 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_and_or_b32 v3, v40, v63, v3
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    v_or3_b32 v3, v3, v5, v6
-; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:224
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:68 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_store_dword v1, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v2, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_mov_b32_e32 v0, v57
+; GFX906-NEXT:    v_mov_b32_e32 v1, v58
+; GFX906-NEXT:    v_mov_b32_e32 v2, v59
+; GFX906-NEXT:    v_mov_b32_e32 v3, v60
+; GFX906-NEXT:    v_mov_b32_e32 v60, v56
+; GFX906-NEXT:    v_mov_b32_e32 v59, v55
+; GFX906-NEXT:    v_mov_b32_e32 v58, v54
+; GFX906-NEXT:    v_mov_b32_e32 v57, v53
+; GFX906-NEXT:    v_mov_b32_e32 v56, v52
+; GFX906-NEXT:    v_mov_b32_e32 v55, v51
+; GFX906-NEXT:    v_mov_b32_e32 v54, v50
+; GFX906-NEXT:    v_mov_b32_e32 v53, v49
+; GFX906-NEXT:    v_mov_b32_e32 v52, v48
+; GFX906-NEXT:    v_mov_b32_e32 v51, v47
+; GFX906-NEXT:    v_mov_b32_e32 v50, v46
+; GFX906-NEXT:    v_mov_b32_e32 v49, v45
+; GFX906-NEXT:    v_mov_b32_e32 v48, v44
+; GFX906-NEXT:    v_mov_b32_e32 v47, v43
+; GFX906-NEXT:    v_mov_b32_e32 v46, v42
+; GFX906-NEXT:    v_mov_b32_e32 v45, v41
+; GFX906-NEXT:    v_mov_b32_e32 v44, v40
+; GFX906-NEXT:    v_mov_b32_e32 v43, v39
+; GFX906-NEXT:    v_mov_b32_e32 v42, v38
+; GFX906-NEXT:    v_mov_b32_e32 v41, v37
+; GFX906-NEXT:    v_mov_b32_e32 v40, v36
+; GFX906-NEXT:    v_mov_b32_e32 v39, v35
+; GFX906-NEXT:    v_mov_b32_e32 v38, v34
+; GFX906-NEXT:    v_mov_b32_e32 v37, v33
+; GFX906-NEXT:    v_mov_b32_e32 v36, v32
+; GFX906-NEXT:    v_mov_b32_e32 v35, v31
+; GFX906-NEXT:    v_mov_b32_e32 v34, v30
+; GFX906-NEXT:    v_mov_b32_e32 v33, v29
+; GFX906-NEXT:    v_mov_b32_e32 v32, v28
+; GFX906-NEXT:    v_mov_b32_e32 v31, v27
+; GFX906-NEXT:    v_mov_b32_e32 v30, v26
+; GFX906-NEXT:    v_mov_b32_e32 v29, v25
+; GFX906-NEXT:    v_mov_b32_e32 v28, v24
+; GFX906-NEXT:    v_mov_b32_e32 v27, v23
+; GFX906-NEXT:    v_mov_b32_e32 v26, v22
+; GFX906-NEXT:    v_mov_b32_e32 v25, v21
+; GFX906-NEXT:    v_mov_b32_e32 v24, v20
+; GFX906-NEXT:    v_mov_b32_e32 v23, v19
+; GFX906-NEXT:    v_mov_b32_e32 v22, v18
+; GFX906-NEXT:    v_mov_b32_e32 v21, v17
+; GFX906-NEXT:    v_mov_b32_e32 v20, v16
+; GFX906-NEXT:    v_mov_b32_e32 v19, v15
+; GFX906-NEXT:    v_mov_b32_e32 v18, v14
+; GFX906-NEXT:    v_mov_b32_e32 v17, v13
+; GFX906-NEXT:    v_mov_b32_e32 v16, v12
+; GFX906-NEXT:    v_mov_b32_e32 v15, v11
+; GFX906-NEXT:    v_mov_b32_e32 v14, v10
+; GFX906-NEXT:    v_mov_b32_e32 v13, v9
+; GFX906-NEXT:    v_mov_b32_e32 v12, v8
+; GFX906-NEXT:    v_mov_b32_e32 v11, v7
+; GFX906-NEXT:    v_mov_b32_e32 v10, v6
+; GFX906-NEXT:    v_mov_b32_e32 v9, v5
 ; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 ; 4-byte Folded Reload
 ; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
 ; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
 ; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:88 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:784 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:72 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(7)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_and_or_b32 v0, v5, v63, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v2
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:788 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:92 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:76 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v1, v6, v63, v1
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX906-NEXT:    v_or3_b32 v1, v1, v2, v3
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:84 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:96 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX906-NEXT:    v_and_or_b32 v2, v7, v63, v2
-; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX906-NEXT:    v_or3_b32 v2, v2, v3, v5
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:104 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 24, v19
+; GFX906-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    buffer_load_dword v4, off, s[8:11], 0 offset:100 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_and_or_b32 v3, v8, v63, v3
+; GFX906-NEXT:    global_store_dwordx4 v4, v[5:8], s[2:3]
+; GFX906-NEXT:    global_store_dwordx4 v4, v[9:12], s[2:3] offset:16
+; GFX906-NEXT:    global_store_dwordx4 v4, v[13:16], s[2:3] offset:32
+; GFX906-NEXT:    global_store_dwordx4 v4, v[17:20], s[2:3] offset:48
+; GFX906-NEXT:    global_store_dwordx4 v4, v[21:24], s[2:3] offset:64
+; GFX906-NEXT:    global_store_dwordx4 v4, v[25:28], s[2:3] offset:80
+; GFX906-NEXT:    global_store_dwordx4 v4, v[29:32], s[2:3] offset:96
+; GFX906-NEXT:    global_store_dwordx4 v4, v[33:36], s[2:3] offset:112
+; GFX906-NEXT:    global_store_dwordx4 v4, v[37:40], s[2:3] offset:128
+; GFX906-NEXT:    global_store_dwordx4 v4, v[41:44], s[2:3] offset:144
+; GFX906-NEXT:    global_store_dwordx4 v4, v[45:48], s[2:3] offset:160
+; GFX906-NEXT:    global_store_dwordx4 v4, v[49:52], s[2:3] offset:176
+; GFX906-NEXT:    global_store_dwordx4 v4, v[53:56], s[2:3] offset:192
+; GFX906-NEXT:    global_store_dwordx4 v4, v[57:60], s[2:3] offset:208
+; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:224
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:20 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:24 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX906-NEXT:    v_or3_b32 v3, v3, v4, v5
-; GFX906-NEXT:    global_store_dwordx4 v61, v[0:3], s[2:3] offset:240
+; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:240
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-loop-carried.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-loop-carried.ll
index 95c541e2e60b7..ffc91815821a1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-loop-carried.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-loop-carried.ll
@@ -6,47 +6,32 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 5, v0
+; GFX906-NEXT:    v_mov_b32_e32 v3, 8
+; GFX906-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GFX906-NEXT:    v_cmp_le_u32_e32 vcc, 15, v0
-; GFX906-NEXT:    v_mov_b32_e32 v3, 0xff
-; GFX906-NEXT:    v_mov_b32_e32 v4, 8
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    global_load_dword v1, v1, s[2:3]
 ; GFX906-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v2, 24, v1
-; GFX906-NEXT:    v_mov_b32_e32 v5, v0
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX906-NEXT:    v_and_or_b32 v0, v1, v2, v0
+; GFX906-NEXT:    v_mov_b32_e32 v2, 24
 ; GFX906-NEXT:  .LBB0_1: ; %bb.1
 ; GFX906-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX906-NEXT:    v_and_or_b32 v6, v1, v3, v6
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX906-NEXT:    v_and_b32_e32 v3, 0xff, v1
 ; GFX906-NEXT:    s_and_b64 s[4:5], exec, vcc
-; GFX906-NEXT:    v_or3_b32 v5, v6, v5, v2
+; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX906-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
-; GFX906-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
-; GFX906-NEXT:    v_mov_b32_e32 v6, v0
+; GFX906-NEXT:    v_or3_b32 v1, v0, v3, v1
 ; GFX906-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX906-NEXT:    s_cbranch_execnz .LBB0_1
 ; GFX906-NEXT:  ; %bb.2: ; %bb.2.loopexit
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX906-NEXT:    v_mov_b32_e32 v4, 8
-; GFX906-NEXT:    v_mov_b32_e32 v3, 0xff
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v0, v1, v3, v0
-; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v5
-; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v2
-; GFX906-NEXT:    v_mov_b32_e32 v1, 0
+; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX906-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()

>From 35c5eb531c2b6d2c732ecc61e09c4713b6381fd2 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 19 Apr 2024 11:09:29 -0700
Subject: [PATCH 10/17] remove unintentional changes

Change-Id: Idbfbbadfc1c3cee6cbd1a814b3446628dcce4394
---
 llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index a37302d3c4126..6e7d34f5adaa3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -107,7 +107,6 @@ class AMDGPUCodeGenPrepareImpl
   Module *Mod = nullptr;
   const DataLayout *DL = nullptr;
   bool HasUnsafeFPMath = false;
-  bool UsesGlobalISel = false;
   bool HasFP32DenormalFlush = false;
   bool FlowChanged = false;
   mutable Function *SqrtF32 = nullptr;
@@ -361,7 +360,6 @@ bool AMDGPUCodeGenPrepareImpl::run(Function &F) {
       Next = std::next(I);
 
       MadeChange |= visit(*I);
-      I->getType();
 
       if (Next != E) { // Control flow changed
         BasicBlock *NextInstBB = Next->getParent();
@@ -373,7 +371,6 @@ bool AMDGPUCodeGenPrepareImpl::run(Function &F) {
       }
     }
   }
-
   return MadeChange;
 }
 
@@ -2278,7 +2275,6 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
   Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
   Impl.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
   Impl.UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
-  Impl.UsesGlobalISel = TM.Options.EnableGlobalISel;
   auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
   Impl.DT = DTWP ? &DTWP->getDomTree() : nullptr;
   Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
@@ -2301,7 +2297,6 @@ PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
   Impl.DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
   Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
   SIModeRegisterDefaults Mode(F, *Impl.ST);
-  Impl.UsesGlobalISel = TM.Options.EnableGlobalISel;
   Impl.HasFP32DenormalFlush =
       Mode.FP32Denormals == DenormalMode::getPreserveSign();
   PreservedAnalyses PA = PreservedAnalyses::none();

>From efe24b60b11f7e1acb92689a7d5445546b40110d Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 30 Apr 2024 13:14:12 -0700
Subject: [PATCH 11/17] Review comments

Change-Id: I244784728ff1b4363ff066f8c5a6fa6d03c2a4d5
---
 .../AMDGPU/AMDGPULateCodeGenPrepare.cpp       | 203 +++++++++---------
 .../test/CodeGen/AMDGPU/vni8-across-blocks.ll |  53 ++++-
 2 files changed, 152 insertions(+), 104 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 822b85fac5188..d7d2ebff03b6b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -126,16 +126,17 @@ class ConversionCandidateInfo {
         NewType(NewType), ConvertBuilder(InsertBlock, InsertPt) {}
 };
 
+typedef std::pair<Instruction *, BasicBlock *> IncomingPair;
+typedef std::pair<Instruction *, SmallVector<IncomingPair, 4>> PHIUpdateInfo;
+
 class LiveRegOptimizer {
 private:
   Module *Mod = nullptr;
+  const DataLayout *DL = nullptr;
   // The scalar type to convert to
   Type *ConvertToScalar;
   // Holds the collection of PHIs with their pending new operands
-  SmallVector<std::pair<Instruction *,
-                        SmallVector<std::pair<Instruction *, BasicBlock *>, 4>>,
-              4>
-      PHIUpdater;
+  SmallVector<PHIUpdateInfo, 4> PHIUpdater;
 
 public:
   // Should the def of the instruction be converted if it is live across blocks
@@ -157,6 +158,7 @@ class LiveRegOptimizer {
   bool replacePHIs();
 
   LiveRegOptimizer(Module *Mod) : Mod(Mod) {
+    DL = &Mod->getDataLayout();
     ConvertToScalar = Type::getInt32Ty(Mod->getContext());
   }
 };
@@ -182,17 +184,18 @@ bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
   AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
   UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
 
-  // "Optimize" the virtual regs that cross basic block boundaries. In such
-  // cases, vectors of illegal types will be scalarized and widened, with each
-  // scalar living in its own physical register. The optimization converts the
-  // vectors to equivalent vectors of legal type (which are convereted back
+  // "Optimize" the virtual regs that cross basic block boundaries. When
+  // building the SelectionDAG, vectors of illegal types that cross basic blocks
+  // will be scalarized and widened, with each scalar living in its
+  // own physical register. To work around this, this optimization converts the
+  // vectors to equivalent vectors of legal type (which are converted back
   // before uses in subsequent blocks), to pack the bits into fewer physical
   // registers (used in CopyToReg/CopyFromReg pairs).
   LiveRegOptimizer LRO(Mod);
 
   bool Changed = false;
   for (auto &BB : F)
-    for (Instruction &I : llvm::make_early_inc_range(BB)) {
+    for (Instruction &I : make_early_inc_range(BB)) {
       Changed |= visit(I);
       if (!LRO.shouldReplaceUses(I))
         continue;
@@ -212,65 +215,59 @@ bool LiveRegOptimizer::replaceUses(Instruction &I) {
   };
   DenseMap<BasicBlock *, ConvertUseInfo> InsertedConversionMap;
 
-  ConversionCandidateInfo FromCCI(
-      &I, I.getParent(),
-      static_cast<BasicBlock::iterator>(std::next(I.getIterator())));
+  ConversionCandidateInfo FromCCI(&I, I.getParent(),
+                                  std::next(I.getIterator()));
   FromCCI.setNewType(getCompatibleType(FromCCI.getLiveRegDef()));
   for (auto IUser = I.user_begin(); IUser != I.user_end(); IUser++) {
 
-    if (Instruction *UserInst = dyn_cast<Instruction>(*IUser)) {
-      if (UserInst->getParent() != I.getParent() || isa<PHINode>(UserInst)) {
-        LLVM_DEBUG(dbgs() << *UserInst << "\n\tUses "
-                          << *FromCCI.getOriginalType()
-                          << " from previous block. Needs conversion\n");
-        convertToOptType(FromCCI);
-        if (!FromCCI.hasConverted())
-          continue;
-        // If it is a PHI node, just create and collect the new operand. We can
-        // only replace the PHI node once we have converted all the operands
-        if (auto PhiInst = dyn_cast<PHINode>(UserInst)) {
-          for (unsigned Idx = 0; Idx < PhiInst->getNumIncomingValues(); Idx++) {
-            Value *IncVal = PhiInst->getIncomingValue(Idx);
-            if (&I == dyn_cast<Instruction>(IncVal)) {
-              BasicBlock *IncBlock = PhiInst->getIncomingBlock(Idx);
-              auto PHIOps = find_if(
-                  PHIUpdater,
-                  [&UserInst](
-                      std::pair<Instruction *,
-                                SmallVector<
-                                    std::pair<Instruction *, BasicBlock *>, 4>>
-                          &Entry) { return Entry.first == UserInst; });
-
-              if (PHIOps == PHIUpdater.end())
-                PHIUpdater.push_back(
-                    {UserInst, {{FromCCI.getConverted(), IncBlock}}});
-              else
-                PHIOps->second.push_back({FromCCI.getConverted(), IncBlock});
-
-              break;
-            }
+    Instruction *UserInst = cast<Instruction>(*IUser);
+    if (UserInst->getParent() != I.getParent() || isa<PHINode>(UserInst)) {
+      LLVM_DEBUG(dbgs() << *UserInst << "\n\tUses "
+                        << *FromCCI.getOriginalType()
+                        << " from previous block. Needs conversion\n");
+      convertToOptType(FromCCI);
+      if (!FromCCI.hasConverted())
+        continue;
+      // If it is a PHI node, just create and collect the new operand. We can
+      // only replace the PHI node once we have converted all the operands
+      if (auto PHI = dyn_cast<PHINode>(UserInst)) {
+        for (unsigned Idx = 0; Idx < PHI->getNumIncomingValues(); Idx++) {
+          Value *IncVal = PHI->getIncomingValue(Idx);
+          if (&I == dyn_cast<Instruction>(IncVal)) {
+            BasicBlock *IncBlock = PHI->getIncomingBlock(Idx);
+            auto PHIOps =
+                find_if(PHIUpdater, [&UserInst](PHIUpdateInfo &Entry) {
+                  return Entry.first == UserInst;
+                });
+
+            if (PHIOps == PHIUpdater.end())
+              PHIUpdater.push_back(
+                  {UserInst, {{FromCCI.getConverted(), IncBlock}}});
+            else
+              PHIOps->second.push_back({FromCCI.getConverted(), IncBlock});
+
+            break;
           }
-          continue;
-        }
-
-        // Do not create multiple conversion sequences if there are multiple
-        // uses in the same block
-        if (InsertedConversionMap.contains(UserInst->getParent())) {
-          InsertedConversionMap[UserInst->getParent()].Users.push_back(
-              UserInst);
-          LLVM_DEBUG(dbgs() << "\tUser already has access to converted def\n");
-          continue;
         }
+        continue;
+      }
 
-        ConversionCandidateInfo ToCCI(
-            FromCCI.getConverted(), I.getType(), UserInst->getParent(),
-            static_cast<BasicBlock::iterator>(
-                UserInst->getParent()->getFirstNonPHIIt()));
-        convertFromOptType(ToCCI);
-        assert(ToCCI.hasConverted());
-        InsertedConversionMap[UserInst->getParent()] = {ToCCI.getConverted(),
-                                                        {UserInst}};
+      // Do not create multiple conversion sequences if there are multiple
+      // uses in the same block
+      if (InsertedConversionMap.contains(UserInst->getParent())) {
+        InsertedConversionMap[UserInst->getParent()].Users.push_back(UserInst);
+        LLVM_DEBUG(dbgs() << "\tUser already has access to converted def\n");
+        continue;
       }
+
+      ConversionCandidateInfo ToCCI(FromCCI.getConverted(), I.getType(),
+                                    UserInst->getParent(),
+
+                                    UserInst->getParent()->getFirstNonPHIIt());
+      convertFromOptType(ToCCI);
+      assert(ToCCI.hasConverted());
+      InsertedConversionMap[UserInst->getParent()] = {ToCCI.getConverted(),
+                                                      {UserInst}};
     }
   }
 
@@ -279,7 +276,7 @@ bool LiveRegOptimizer::replaceUses(Instruction &I) {
   for (auto &Entry : InsertedConversionMap) {
     for (auto &UserInst : Entry.second.Users) {
       LLVM_DEBUG(dbgs() << *UserInst
-                        << "\n\tNow uses: " << *Entry.second.Converted << "\n");
+                        << "\n\tNow uses: " << *Entry.second.Converted << '\n');
       UserInst->replaceUsesOfWith(&I, Entry.second.Converted);
       MadeChange = true;
     }
@@ -290,29 +287,29 @@ bool LiveRegOptimizer::replaceUses(Instruction &I) {
 bool LiveRegOptimizer::replacePHIs() {
   bool MadeChange = false;
   for (auto Ele : PHIUpdater) {
-    auto ThePHINode = cast<PHINode>(Ele.first);
-    auto NewPHINodeOps = Ele.second;
-    LLVM_DEBUG(dbgs() << "Attempting to replace: " << *ThePHINode << "\n");
+    auto [ThePHIInst, NewPHINodeOps] = Ele;
+    LLVM_DEBUG(dbgs() << "Attempting to replace: " << *ThePHIInst << '\n');
     // If we have conveted all the required operands, then do the replacement
-    if (ThePHINode->getNumIncomingValues() == NewPHINodeOps.size()) {
+    if (cast<PHINode>(ThePHIInst)->getNumIncomingValues() ==
+        NewPHINodeOps.size()) {
       IRBuilder<> Builder(Ele.first);
       auto NPHI = Builder.CreatePHI(NewPHINodeOps[0].first->getType(),
                                     NewPHINodeOps.size());
       for (auto IncVals : NewPHINodeOps) {
         NPHI->addIncoming(IncVals.first, IncVals.second);
         LLVM_DEBUG(dbgs() << "  Using: " << *IncVals.first
-                          << "  For: " << IncVals.second->getName() << "\n");
+                          << "  For: " << IncVals.second->getName() << '\n');
       }
-      LLVM_DEBUG(dbgs() << "Sucessfully replaced with " << *NPHI << "\n");
+      LLVM_DEBUG(dbgs() << "Sucessfully replaced with " << *NPHI << '\n');
       ConversionCandidateInfo ToCCI(
-          NPHI, ThePHINode->getType(), ThePHINode->getParent(),
-          static_cast<BasicBlock::iterator>(
-              ThePHINode->getParent()->getFirstNonPHIIt()));
+          NPHI, ThePHIInst->getType(), ThePHIInst->getParent(),
+
+          ThePHIInst->getParent()->getFirstNonPHIIt());
       convertFromOptType(ToCCI);
       assert(ToCCI.hasConverted());
       Ele.first->replaceAllUsesWith(ToCCI.getConverted());
       // The old PHI is no longer used
-      ThePHINode->eraseFromParent();
+      ThePHIInst->eraseFromParent();
       MadeChange = true;
     }
   }
@@ -327,8 +324,8 @@ Type *LiveRegOptimizer::getCompatibleType(Instruction *InstToConvert) {
   if (!VTy)
     return ConvertToScalar;
 
-  unsigned OriginalSize = VTy->getPrimitiveSizeInBits();
-  unsigned ConvertScalarSize = ConvertToScalar->getScalarSizeInBits();
+  TypeSize OriginalSize = DL->getTypeSizeInBits(VTy);
+  TypeSize ConvertScalarSize = DL->getTypeSizeInBits(ConvertToScalar);
   unsigned ConvertEltCount =
       (OriginalSize + ConvertScalarSize - 1) / ConvertScalarSize;
 
@@ -336,7 +333,7 @@ Type *LiveRegOptimizer::getCompatibleType(Instruction *InstToConvert) {
     return IntegerType::get(Mod->getContext(), ConvertScalarSize);
 
   return VectorType::get(Type::getIntNTy(Mod->getContext(), ConvertScalarSize),
-                         llvm::ElementCount::getFixed(ConvertEltCount));
+                         ElementCount::getFixed(ConvertEltCount));
 }
 
 void LiveRegOptimizer::convertToOptType(ConversionCandidateInfo &LR) {
@@ -348,24 +345,24 @@ void LiveRegOptimizer::convertToOptType(ConversionCandidateInfo &LR) {
   VectorType *VTy = cast<VectorType>(LR.getOriginalType());
   Type *NewTy = LR.getNewType();
 
-  unsigned OriginalSize = VTy->getPrimitiveSizeInBits();
-  unsigned NewSize = NewTy->getPrimitiveSizeInBits();
+  TypeSize OriginalSize = DL->getTypeSizeInBits(VTy);
+  TypeSize NewSize = DL->getTypeSizeInBits(NewTy);
 
   auto &Builder = LR.getConvertBuilder();
-  Value *V = static_cast<Value *>(LR.getLiveRegDef());
+  Value *V = cast<Value>(LR.getLiveRegDef());
   // If there is a bitsize match, we can fit the old vector into a new vector of
   // desired type
   if (OriginalSize == NewSize) {
-    LR.setConverted(dyn_cast<Instruction>(Builder.CreateBitCast(V, NewTy)));
+    LR.setConverted(cast<Instruction>(Builder.CreateBitCast(V, NewTy)));
     LLVM_DEBUG(dbgs() << "\tConverted def to " << *LR.getConverted()->getType()
-                      << "\n");
+                      << '\n');
     return;
   }
 
   // If there is a bitsize mismatch, we must use a wider vector
   assert(NewSize > OriginalSize);
   ElementCount ExpandedVecElementCount =
-      llvm::ElementCount::getFixed(NewSize / VTy->getScalarSizeInBits());
+      ElementCount::getFixed(NewSize / VTy->getScalarSizeInBits());
 
   SmallVector<int, 8> ShuffleMask;
   for (unsigned I = 0; I < VTy->getElementCount().getFixedValue(); I++)
@@ -375,12 +372,11 @@ void LiveRegOptimizer::convertToOptType(ConversionCandidateInfo &LR) {
        I < ExpandedVecElementCount.getFixedValue(); I++)
     ShuffleMask.push_back(VTy->getElementCount().getFixedValue());
 
-  auto ExpandedVec =
-      dyn_cast<Instruction>(Builder.CreateShuffleVector(V, ShuffleMask));
-  LR.setConverted(
-      dyn_cast<Instruction>(Builder.CreateBitCast(ExpandedVec, NewTy)));
+  Instruction *ExpandedVec =
+      cast<Instruction>(Builder.CreateShuffleVector(V, ShuffleMask));
+  LR.setConverted(cast<Instruction>(Builder.CreateBitCast(ExpandedVec, NewTy)));
   LLVM_DEBUG(dbgs() << "\tConverted def to " << *LR.getConverted()->getType()
-                    << "\n");
+                    << '\n');
   return;
 }
 
@@ -388,48 +384,49 @@ void LiveRegOptimizer::convertFromOptType(ConversionCandidateInfo &LRC) {
   Type *OTy = LRC.getOriginalType();
   VectorType *NewVTy = cast<VectorType>(LRC.getNewType());
 
-  unsigned OriginalSize = OTy->getPrimitiveSizeInBits();
-  unsigned NewSize = NewVTy->getPrimitiveSizeInBits();
+  TypeSize OriginalSize = DL->getTypeSizeInBits(OTy);
+  TypeSize NewSize = DL->getTypeSizeInBits(NewVTy);
 
   auto &Builder = LRC.getConvertBuilder();
-  Value *V = static_cast<Value *>(LRC.getLiveRegDef());
+  Value *V = cast<Value>(LRC.getLiveRegDef());
   // If there is a bitsize match, we simply convert back to the original type
   if (OriginalSize == NewSize) {
-    LRC.setConverted(dyn_cast<Instruction>(Builder.CreateBitCast(V, NewVTy)));
+    LRC.setConverted(cast<Instruction>(Builder.CreateBitCast(V, NewVTy)));
     LLVM_DEBUG(dbgs() << "\tProduced for user: " << *LRC.getConverted()
-                      << "\n");
+                      << '\n');
     return;
   }
 
   if (!OTy->isVectorTy()) {
-    auto Trunc = dyn_cast<Instruction>(Builder.CreateTrunc(
+    Instruction *Trunc = cast<Instruction>(Builder.CreateTrunc(
         LRC.getLiveRegDef(), IntegerType::get(Mod->getContext(), NewSize)));
-    auto Original = dyn_cast<Instruction>(Builder.CreateBitCast(Trunc, NewVTy));
-    LRC.setConverted(dyn_cast<Instruction>(Original));
+    Instruction *Original =
+        cast<Instruction>(Builder.CreateBitCast(Trunc, NewVTy));
+    LRC.setConverted(cast<Instruction>(Original));
     LLVM_DEBUG(dbgs() << "\tProduced for user: " << *LRC.getConverted()
-                      << "\n");
+                      << '\n');
     return;
   }
 
   // If there is a bitsize mismatch, we have used a wider vector and must strip
   // the MSBs to convert back to the original type
   assert(OriginalSize > NewSize);
-  ElementCount ExpandedVecElementCount = llvm::ElementCount::getFixed(
-      OriginalSize / NewVTy->getScalarSizeInBits());
+  ElementCount ExpandedVecElementCount =
+      ElementCount::getFixed(OriginalSize / NewVTy->getScalarSizeInBits());
   VectorType *ExpandedVT = VectorType::get(
       Type::getIntNTy(Mod->getContext(), NewVTy->getScalarSizeInBits()),
       ExpandedVecElementCount);
-  Instruction *Converted = dyn_cast<Instruction>(
-      Builder.CreateBitCast(LRC.getLiveRegDef(), ExpandedVT));
+  Instruction *Converted =
+      cast<Instruction>(Builder.CreateBitCast(LRC.getLiveRegDef(), ExpandedVT));
 
   unsigned NarrowElementCount = NewVTy->getElementCount().getFixedValue();
   SmallVector<int, 8> ShuffleMask(NarrowElementCount);
   std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);
 
-  Instruction *NarrowVec = dyn_cast<Instruction>(
-      Builder.CreateShuffleVector(Converted, ShuffleMask));
-  LRC.setConverted(dyn_cast<Instruction>(NarrowVec));
-  LLVM_DEBUG(dbgs() << "\tProduced for user: " << *LRC.getConverted() << "\n");
+  Instruction *NarrowVec =
+      cast<Instruction>(Builder.CreateShuffleVector(Converted, ShuffleMask));
+  LRC.setConverted(cast<Instruction>(NarrowVec));
+  LLVM_DEBUG(dbgs() << "\tProduced for user: " << *LRC.getConverted() << '\n');
   return;
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index 57179f8f26aec..d08523f4d3cd0 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -350,5 +350,56 @@ bb.2:
   ret void
 }
 
-declare i32 @llvm.amdgcn.workitem.id.x()
+define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: repeat_successor:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_load_dword s8, s[0:1], 0x24
+; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    s_cmp_lt_i32 s8, 3
+; GFX906-NEXT:    s_cbranch_scc0 .LBB7_3
+; GFX906-NEXT:  ; %bb.1: ; %LeafBlock
+; GFX906-NEXT:    s_cmp_gt_i32 s8, 0
+; GFX906-NEXT:    s_cbranch_scc0 .LBB7_6
+; GFX906-NEXT:  ; %bb.2:
+; GFX906-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX906-NEXT:    global_load_dword v0, v0, s[4:5]
+; GFX906-NEXT:    s_branch .LBB7_5
+; GFX906-NEXT:  .LBB7_3: ; %LeafBlock5
+; GFX906-NEXT:    s_cmp_eq_u32 s8, 3
+; GFX906-NEXT:    s_cbranch_scc0 .LBB7_6
+; GFX906-NEXT:  ; %bb.4: ; %sw.bb5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX906-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX906-NEXT:  .LBB7_5: ; %return.sink.split
+; GFX906-NEXT:    v_mov_b32_e32 v1, 0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX906-NEXT:  .LBB7_6: ; %return
+; GFX906-NEXT:    s_endpgm
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+  switch i32 %in, label %return [
+    i32 1, label %return.sink.split
+    i32 2, label %return.sink.split
+    i32 3, label %sw.bb5
+  ]
+
+sw.bb5:
+  br label %return.sink.split
+
+return.sink.split:
+  %tmp5 = phi <4 x i8> [ %vec2, %sw.bb5 ], [ %vec1, %entry ], [ %vec1, %entry ]
+  store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4
+  ret void
 
+return:
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()

>From cbc7922f6141934d48ce46e07d1538814b0f0cc5 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 14 Sep 2023 12:20:06 -0700
Subject: [PATCH 12/17] Refactor the basic implementation

Change-Id: Icf1fc334974e4e1274295c594747065c50972ff7
---
 .../AMDGPU/AMDGPULateCodeGenPrepare.cpp       | 435 ++++++++----------
 .../AMDGPU/GlobalISel/vni8-across-blocks.ll   | 209 +++++++++
 .../AMDGPU/GlobalISel/vni8-loop-carried.ll    |  52 ---
 .../test/CodeGen/AMDGPU/vni8-across-blocks.ll | 150 ++++++
 llvm/test/CodeGen/AMDGPU/vni8-loop-carried.ll |  46 --
 5 files changed, 550 insertions(+), 342 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-loop-carried.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/vni8-loop-carried.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index d7d2ebff03b6b..78bff7ffcacd6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -81,81 +81,41 @@ class AMDGPULateCodeGenPrepare
   bool visitLoadInst(LoadInst &LI);
 };
 
-class ConversionCandidateInfo {
-private:
-  // The instruction which defined the original virtual register used across
-  // blocks
-  Instruction *LiveRegDef;
-  // The original type
-  Type *OriginalType;
-  // The desired type
-  Type *NewType;
-  // The instruction sequence that converts the virtual register, to be used
-  // instead of the original
-  Instruction *Converted = nullptr;
-  // The builder used to build the conversion instruction
-  IRBuilder<> ConvertBuilder;
-
-public:
-  // The instruction which defined the original virtual register used across
-  // blocks
-  Instruction *getLiveRegDef() { return LiveRegDef; }
-  // The original type
-  Type *getOriginalType() { return OriginalType; }
-  // The desired type
-  Type *getNewType() { return NewType; }
-  void setNewType(Type *NewType) { this->NewType = NewType; }
-  // The instruction that conerts the virtual register, to be used instead of
-  // the original
-  Instruction *getConverted() { return Converted; }
-  void setConverted(Instruction *Converted) { this->Converted = Converted; }
-  // The builder used to build the conversion instruction
-  IRBuilder<> &getConvertBuilder() { return ConvertBuilder; }
-  // Do we have a instruction sequence which convert the original virtual
-  // register
-  bool hasConverted() { return Converted != nullptr; }
-
-  ConversionCandidateInfo(Instruction *LiveRegDef, BasicBlock *InsertBlock,
-                          BasicBlock::iterator InsertPt)
-      : LiveRegDef(LiveRegDef), OriginalType(LiveRegDef->getType()),
-        ConvertBuilder(InsertBlock, InsertPt) {}
-  ConversionCandidateInfo(Instruction *LiveRegDef, Type *NewType,
-                          BasicBlock *InsertBlock,
-                          BasicBlock::iterator InsertPt)
-      : LiveRegDef(LiveRegDef), OriginalType(LiveRegDef->getType()),
-        NewType(NewType), ConvertBuilder(InsertBlock, InsertPt) {}
-};
-
-typedef std::pair<Instruction *, BasicBlock *> IncomingPair;
-typedef std::pair<Instruction *, SmallVector<IncomingPair, 4>> PHIUpdateInfo;
+using ValueToValueMap = DenseMap<const Value *, Value *>;
 
 class LiveRegOptimizer {
 private:
   Module *Mod = nullptr;
   const DataLayout *DL = nullptr;
-  // The scalar type to convert to
+  /// The scalar type to convert to
   Type *ConvertToScalar;
-  // Holds the collection of PHIs with their pending new operands
-  SmallVector<PHIUpdateInfo, 4> PHIUpdater;
+  /// The set of visited Instructions
+  SmallPtrSet<Instruction *, 4> Visited;
+  /// The set of Instructions to be deleted
+  SmallPtrSet<Instruction *, 4> DeadInstrs;
+  /// Map of Value -> Converted Value
+  ValueToValueMap ValMap;
 
 public:
-  // Should the def of the instruction be converted if it is live across blocks
-  bool shouldReplaceUses(const Instruction &I);
-  // Convert the virtual register to the compatible vector of legal type
-  void convertToOptType(ConversionCandidateInfo &LR);
-  // Convert the virtual register back to the original type, stripping away
-  // the MSBs in cases where there was an imperfect fit (e.g. v2i32 -> v7i8)
-  void convertFromOptType(ConversionCandidateInfo &LR);
-  // Get a vector of desired scalar type that is compatible with the original
-  // vector. In cases where there is no bitsize equivalent using a legal vector
-  // type, we pad the MSBs (e.g. v7i8 -> v2i32)
-  Type *getCompatibleType(Instruction *InstToConvert);
-  // Find and replace uses of the virtual register in different block with a
-  // newly produced virtual register of legal type
-  bool replaceUses(Instruction &I);
-  // Replace the collected PHIs with newly produced incoming values. Replacement
-  // is only done if we have a replacement for each original incoming value.
-  bool replacePHIs();
+  /// Calculate and \p return the type to convert to given a problematic \p
+  /// OriginalType. In some instances, we may widen the type (e.g. v2i8 -> i32).
+  Type *calculateConvertType(Type *OriginalType);
+  /// Convert the virtual register defined by \p V to the compatible vector of
+  /// legal type
+  Value *convertToOptType(Instruction *V, BasicBlock::iterator &InstPt);
+  /// Convert the virtual register defined by \p V back to the original type \p
+  /// ConvertType, stripping away the MSBs in cases where there was an imperfect
+  /// fit (e.g. v2i32 -> v7i8)
+  Value *convertFromOptType(Type *ConvertType, Instruction *V,
+                            BasicBlock::iterator &InstPt,
+                            BasicBlock *InsertBlock);
+  /// Check for problematic PHI nodes or cross-bb values based on the value
+  /// defined by \p I, and coerce to legal types if necessary. For problematic
+  /// PHI node, we coerce all incoming values in a single invocation.
+  bool optimizeLiveType(Instruction *I);
+
+  /// Remove all instructions that have become dead (i.e. all the re-typed PHIs)
+  void removeDeadInstrs();
 
   LiveRegOptimizer(Module *Mod) : Mod(Mod) {
     DL = &Mod->getDataLayout();
@@ -187,142 +147,31 @@ bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
   // "Optimize" the virtual regs that cross basic block boundaries. When
   // building the SelectionDAG, vectors of illegal types that cross basic blocks
   // will be scalarized and widened, with each scalar living in its
-  // own physical register. To work around this, this optimization converts the
+  // own register. To work around this, this optimization converts the
   // vectors to equivalent vectors of legal type (which are converted back
   // before uses in subsequent blocks), to pack the bits into fewer physical
   // registers (used in CopyToReg/CopyFromReg pairs).
   LiveRegOptimizer LRO(Mod);
 
   bool Changed = false;
+
   for (auto &BB : F)
     for (Instruction &I : make_early_inc_range(BB)) {
       Changed |= visit(I);
-      if (!LRO.shouldReplaceUses(I))
-        continue;
-      Changed |= LRO.replaceUses(I);
+      Changed |= LRO.optimizeLiveType(&I);
     }
 
-  Changed |= LRO.replacePHIs();
+  LRO.removeDeadInstrs();
   return Changed;
 }
 
-bool LiveRegOptimizer::replaceUses(Instruction &I) {
-  bool MadeChange = false;
-
-  struct ConvertUseInfo {
-    Instruction *Converted;
-    SmallVector<Instruction *, 4> Users;
-  };
-  DenseMap<BasicBlock *, ConvertUseInfo> InsertedConversionMap;
-
-  ConversionCandidateInfo FromCCI(&I, I.getParent(),
-                                  std::next(I.getIterator()));
-  FromCCI.setNewType(getCompatibleType(FromCCI.getLiveRegDef()));
-  for (auto IUser = I.user_begin(); IUser != I.user_end(); IUser++) {
-
-    Instruction *UserInst = cast<Instruction>(*IUser);
-    if (UserInst->getParent() != I.getParent() || isa<PHINode>(UserInst)) {
-      LLVM_DEBUG(dbgs() << *UserInst << "\n\tUses "
-                        << *FromCCI.getOriginalType()
-                        << " from previous block. Needs conversion\n");
-      convertToOptType(FromCCI);
-      if (!FromCCI.hasConverted())
-        continue;
-      // If it is a PHI node, just create and collect the new operand. We can
-      // only replace the PHI node once we have converted all the operands
-      if (auto PHI = dyn_cast<PHINode>(UserInst)) {
-        for (unsigned Idx = 0; Idx < PHI->getNumIncomingValues(); Idx++) {
-          Value *IncVal = PHI->getIncomingValue(Idx);
-          if (&I == dyn_cast<Instruction>(IncVal)) {
-            BasicBlock *IncBlock = PHI->getIncomingBlock(Idx);
-            auto PHIOps =
-                find_if(PHIUpdater, [&UserInst](PHIUpdateInfo &Entry) {
-                  return Entry.first == UserInst;
-                });
-
-            if (PHIOps == PHIUpdater.end())
-              PHIUpdater.push_back(
-                  {UserInst, {{FromCCI.getConverted(), IncBlock}}});
-            else
-              PHIOps->second.push_back({FromCCI.getConverted(), IncBlock});
-
-            break;
-          }
-        }
-        continue;
-      }
-
-      // Do not create multiple conversion sequences if there are multiple
-      // uses in the same block
-      if (InsertedConversionMap.contains(UserInst->getParent())) {
-        InsertedConversionMap[UserInst->getParent()].Users.push_back(UserInst);
-        LLVM_DEBUG(dbgs() << "\tUser already has access to converted def\n");
-        continue;
-      }
-
-      ConversionCandidateInfo ToCCI(FromCCI.getConverted(), I.getType(),
-                                    UserInst->getParent(),
-
-                                    UserInst->getParent()->getFirstNonPHIIt());
-      convertFromOptType(ToCCI);
-      assert(ToCCI.hasConverted());
-      InsertedConversionMap[UserInst->getParent()] = {ToCCI.getConverted(),
-                                                      {UserInst}};
-    }
-  }
-
-  // Replace uses of with in a separate loop that is not dependent upon the
-  // state of the uses
-  for (auto &Entry : InsertedConversionMap) {
-    for (auto &UserInst : Entry.second.Users) {
-      LLVM_DEBUG(dbgs() << *UserInst
-                        << "\n\tNow uses: " << *Entry.second.Converted << '\n');
-      UserInst->replaceUsesOfWith(&I, Entry.second.Converted);
-      MadeChange = true;
-    }
-  }
-  return MadeChange;
-}
-
-bool LiveRegOptimizer::replacePHIs() {
-  bool MadeChange = false;
-  for (auto Ele : PHIUpdater) {
-    auto [ThePHIInst, NewPHINodeOps] = Ele;
-    LLVM_DEBUG(dbgs() << "Attempting to replace: " << *ThePHIInst << '\n');
-    // If we have conveted all the required operands, then do the replacement
-    if (cast<PHINode>(ThePHIInst)->getNumIncomingValues() ==
-        NewPHINodeOps.size()) {
-      IRBuilder<> Builder(Ele.first);
-      auto NPHI = Builder.CreatePHI(NewPHINodeOps[0].first->getType(),
-                                    NewPHINodeOps.size());
-      for (auto IncVals : NewPHINodeOps) {
-        NPHI->addIncoming(IncVals.first, IncVals.second);
-        LLVM_DEBUG(dbgs() << "  Using: " << *IncVals.first
-                          << "  For: " << IncVals.second->getName() << '\n');
-      }
-      LLVM_DEBUG(dbgs() << "Sucessfully replaced with " << *NPHI << '\n');
-      ConversionCandidateInfo ToCCI(
-          NPHI, ThePHIInst->getType(), ThePHIInst->getParent(),
-
-          ThePHIInst->getParent()->getFirstNonPHIIt());
-      convertFromOptType(ToCCI);
-      assert(ToCCI.hasConverted());
-      Ele.first->replaceAllUsesWith(ToCCI.getConverted());
-      // The old PHI is no longer used
-      ThePHIInst->eraseFromParent();
-      MadeChange = true;
-    }
-  }
-  return MadeChange;
-}
-
-Type *LiveRegOptimizer::getCompatibleType(Instruction *InstToConvert) {
-  Type *OriginalType = InstToConvert->getType();
+Type *LiveRegOptimizer::calculateConvertType(Type *OriginalType) {
   assert(OriginalType->getScalarSizeInBits() <=
          ConvertToScalar->getScalarSizeInBits());
-  VectorType *VTy = dyn_cast<VectorType>(OriginalType);
+
+  FixedVectorType *VTy = dyn_cast<FixedVectorType>(OriginalType);
   if (!VTy)
-    return ConvertToScalar;
+    return nullptr;
 
   TypeSize OriginalSize = DL->getTypeSizeInBits(VTy);
   TypeSize ConvertScalarSize = DL->getTypeSizeInBits(ConvertToScalar);
@@ -336,30 +185,24 @@ Type *LiveRegOptimizer::getCompatibleType(Instruction *InstToConvert) {
                          ElementCount::getFixed(ConvertEltCount));
 }
 
-void LiveRegOptimizer::convertToOptType(ConversionCandidateInfo &LR) {
-  if (LR.hasConverted()) {
-    LLVM_DEBUG(dbgs() << "\tAlready has converted def\n");
-    return;
-  }
-
-  VectorType *VTy = cast<VectorType>(LR.getOriginalType());
-  Type *NewTy = LR.getNewType();
+Value *LiveRegOptimizer::convertToOptType(Instruction *V,
+                                          BasicBlock::iterator &InsertPt) {
+  VectorType *VTy = cast<VectorType>(V->getType());
+  Type *NewTy = calculateConvertType(V->getType());
+  if (!NewTy)
+    return nullptr;
 
   TypeSize OriginalSize = DL->getTypeSizeInBits(VTy);
   TypeSize NewSize = DL->getTypeSizeInBits(NewTy);
 
-  auto &Builder = LR.getConvertBuilder();
-  Value *V = cast<Value>(LR.getLiveRegDef());
+  IRBuilder<> Builder(V->getParent(), InsertPt);
   // If there is a bitsize match, we can fit the old vector into a new vector of
-  // desired type
-  if (OriginalSize == NewSize) {
-    LR.setConverted(cast<Instruction>(Builder.CreateBitCast(V, NewTy)));
-    LLVM_DEBUG(dbgs() << "\tConverted def to " << *LR.getConverted()->getType()
-                      << '\n');
-    return;
-  }
+  // desired type.
+  if (OriginalSize == NewSize)
+    return cast<Instruction>(
+        Builder.CreateBitCast(V, NewTy, V->getName() + ".bc"));
 
-  // If there is a bitsize mismatch, we must use a wider vector
+  // If there is a bitsize mismatch, we must use a wider vector.
   assert(NewSize > OriginalSize);
   ElementCount ExpandedVecElementCount =
       ElementCount::getFixed(NewSize / VTy->getScalarSizeInBits());
@@ -374,69 +217,173 @@ void LiveRegOptimizer::convertToOptType(ConversionCandidateInfo &LR) {
 
   Instruction *ExpandedVec =
       cast<Instruction>(Builder.CreateShuffleVector(V, ShuffleMask));
-  LR.setConverted(cast<Instruction>(Builder.CreateBitCast(ExpandedVec, NewTy)));
-  LLVM_DEBUG(dbgs() << "\tConverted def to " << *LR.getConverted()->getType()
-                    << '\n');
-  return;
+  return cast<Instruction>(
+      Builder.CreateBitCast(ExpandedVec, NewTy, V->getName() + ".bc"));
 }
 
-void LiveRegOptimizer::convertFromOptType(ConversionCandidateInfo &LRC) {
-  Type *OTy = LRC.getOriginalType();
-  VectorType *NewVTy = cast<VectorType>(LRC.getNewType());
+Value *LiveRegOptimizer::convertFromOptType(Type *ConvertType, Instruction *V,
+                                            BasicBlock::iterator &InsertPt,
+                                            BasicBlock *InsertBB) {
+  VectorType *NewVTy = cast<VectorType>(ConvertType);
 
-  TypeSize OriginalSize = DL->getTypeSizeInBits(OTy);
+  TypeSize OriginalSize = DL->getTypeSizeInBits(V->getType());
   TypeSize NewSize = DL->getTypeSizeInBits(NewVTy);
 
-  auto &Builder = LRC.getConvertBuilder();
-  Value *V = cast<Value>(LRC.getLiveRegDef());
-  // If there is a bitsize match, we simply convert back to the original type
+  IRBuilder<> Builder(InsertBB, InsertPt);
+  // If there is a bitsize match, we simply convert back to the original type.
   if (OriginalSize == NewSize) {
-    LRC.setConverted(cast<Instruction>(Builder.CreateBitCast(V, NewVTy)));
-    LLVM_DEBUG(dbgs() << "\tProduced for user: " << *LRC.getConverted()
-                      << '\n');
-    return;
+    return cast<Instruction>(
+        Builder.CreateBitCast(V, NewVTy, V->getName() + ".bc"));
   }
 
-  if (!OTy->isVectorTy()) {
-    Instruction *Trunc = cast<Instruction>(Builder.CreateTrunc(
-        LRC.getLiveRegDef(), IntegerType::get(Mod->getContext(), NewSize)));
-    Instruction *Original =
-        cast<Instruction>(Builder.CreateBitCast(Trunc, NewVTy));
-    LRC.setConverted(cast<Instruction>(Original));
-    LLVM_DEBUG(dbgs() << "\tProduced for user: " << *LRC.getConverted()
-                      << '\n');
-    return;
+  // If there is a bitsize mismatch, then we must have used a wider value to
+  // hold the bits.
+  assert(OriginalSize > NewSize);
+  // For wide scalars, we can just truncate the value.
+  if (!V->getType()->isVectorTy()) {
+    Instruction *Trunc = cast<Instruction>(
+        Builder.CreateTrunc(V, IntegerType::get(Mod->getContext(), NewSize)));
+    return cast<Instruction>(Builder.CreateBitCast(Trunc, NewVTy));
   }
 
-  // If there is a bitsize mismatch, we have used a wider vector and must strip
-  // the MSBs to convert back to the original type
-  assert(OriginalSize > NewSize);
+  // For wider vectors, we must strip the MSBs to convert back to the original
+  // type.
   ElementCount ExpandedVecElementCount =
       ElementCount::getFixed(OriginalSize / NewVTy->getScalarSizeInBits());
   VectorType *ExpandedVT = VectorType::get(
       Type::getIntNTy(Mod->getContext(), NewVTy->getScalarSizeInBits()),
       ExpandedVecElementCount);
   Instruction *Converted =
-      cast<Instruction>(Builder.CreateBitCast(LRC.getLiveRegDef(), ExpandedVT));
+      cast<Instruction>(Builder.CreateBitCast(V, ExpandedVT));
 
   unsigned NarrowElementCount = NewVTy->getElementCount().getFixedValue();
   SmallVector<int, 8> ShuffleMask(NarrowElementCount);
   std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);
 
-  Instruction *NarrowVec =
-      cast<Instruction>(Builder.CreateShuffleVector(Converted, ShuffleMask));
-  LRC.setConverted(cast<Instruction>(NarrowVec));
-  LLVM_DEBUG(dbgs() << "\tProduced for user: " << *LRC.getConverted() << '\n');
-  return;
+  return cast<Instruction>(Builder.CreateShuffleVector(Converted, ShuffleMask));
+}
+
+bool LiveRegOptimizer::optimizeLiveType(Instruction *I) {
+  SmallVector<Instruction *, 4> Worklist;
+  SmallPtrSet<PHINode *, 4> PhiNodes;
+  SmallPtrSet<Instruction *, 4> Defs;
+  SmallPtrSet<Instruction *, 4> Uses;
+
+  Worklist.push_back(cast<Instruction>(I));
+  while (!Worklist.empty()) {
+    Instruction *II = Worklist.pop_back_val();
+    if (Visited.count(II))
+      continue;
+    Visited.insert(II);
+
+    Type *ITy = II->getType();
+    // Only vectors of illegal type will be scalarized when building the
+    // selection DAG.
+    bool ShouldReplace = ITy->isVectorTy() && ITy->getScalarSizeInBits() < 16 &&
+                         !ITy->getScalarType()->isPointerTy();
+
+    if (!ShouldReplace)
+      continue;
+
+    if (auto *Phi = dyn_cast<PHINode>(II)) {
+      PhiNodes.insert(Phi);
+      // Collect all the incoming values of problematic PHI nodes.
+      for (Value *V : Phi->incoming_values()) {
+        // Repeat the collection process for newly found PHI nodes.
+        if (auto *OpPhi = dyn_cast<PHINode>(V)) {
+          if (!PhiNodes.count(OpPhi) && !Visited.count(OpPhi))
+            Worklist.push_back(OpPhi);
+          continue;
+        }
+
+        auto IncInst = dyn_cast<Instruction>(V);
+        if (!IncInst)
+          return false;
+
+        // Collect all other incoming values for coercion.
+        Defs.insert(IncInst);
+      }
+    }
+
+    // Collect all relevant uses.
+    for (User *V : II->users()) {
+      // Repeat the collection process for problematic PHI nodes.
+      if (auto *OpPhi = dyn_cast<PHINode>(V)) {
+        if (!PhiNodes.count(OpPhi) && !Visited.count(OpPhi))
+          Worklist.push_back(OpPhi);
+        continue;
+      }
+
+      auto UseInst = cast<Instruction>(V);
+      // Collect all uses of PHINodes and any use that crosses BB boundaries.
+      if (UseInst->getParent() != II->getParent() || isa<PHINode>(II)) {
+        Uses.insert(UseInst);
+        if (!Defs.count(II) && !isa<PHINode>(II)) {
+          Defs.insert(II);
+        }
+      }
+    }
+  }
+
+  // Coerce and track the defs.
+  for (Instruction *D : Defs) {
+    if (!ValMap.contains(D)) {
+      BasicBlock::iterator InsertPt = std::next(D->getIterator());
+      Value *ConvertVal = convertToOptType(D, InsertPt);
+      assert(ConvertVal);
+      ValMap[D] = ConvertVal;
+    }
+  }
+
+  // Construct new-typed PHI nodes.
+  for (PHINode *Phi : PhiNodes) {
+    assert(calculateConvertType(Phi->getType()));
+    ValMap[Phi] = PHINode::Create(calculateConvertType(Phi->getType()),
+                                  Phi->getNumIncomingValues(),
+                                  Phi->getName() + ".tc", Phi->getIterator());
+  }
+  // Connect all the PHI nodes with their new incoming values.
+  for (PHINode *Phi : PhiNodes) {
+    PHINode *NewPhi = cast<PHINode>(ValMap[Phi]);
+    for (int I = 0, E = Phi->getNumIncomingValues(); I < E; I++)
+      NewPhi->addIncoming(ValMap[Phi->getIncomingValue(I)],
+                          Phi->getIncomingBlock(I));
+    Visited.insert(NewPhi);
+  }
+  // Coerce back to the original type and replace the uses.
+  for (Instruction *U : Uses) {
+    // Replace all converted operands for a use.
+    for (auto [OpIdx, Op] : enumerate(U->operands())) {
+      if (ValMap.contains(Op)) {
+        Value *NewVal = nullptr;
+        if (ValMap.contains(ValMap[Op]))
+          NewVal = ValMap[Op];
+        else {
+          BasicBlock::iterator InsertPt = U->getIterator();
+          NewVal =
+              convertFromOptType(Op->getType(), cast<Instruction>(ValMap[Op]),
+                                 InsertPt, U->getParent());
+          ValMap[ValMap[Op]] = NewVal;
+        }
+        assert(NewVal);
+        U->setOperand(OpIdx, NewVal);
+      }
+    }
+  }
+
+  // Save the removed phis to be deleted later.
+  for (PHINode *Phi : PhiNodes) {
+    DeadInstrs.insert(Phi);
+  }
+  return true;
 }
 
-bool LiveRegOptimizer::shouldReplaceUses(const Instruction &I) {
-  // Vectors of illegal types are copied across blocks in an efficient manner.
-  // They are scalarized and widened to legal scalars. In such cases, we can do
-  // better by using legal vector types
-  Type *IType = I.getType();
-  return IType->isVectorTy() && IType->getScalarSizeInBits() < 16 &&
-         !I.getType()->getScalarType()->isPointerTy();
+void LiveRegOptimizer::removeDeadInstrs() {
+  // Remove instrs that have been marked dead after type-coercion.
+  for (auto *I : DeadInstrs) {
+    I->replaceAllUsesWith(PoisonValue::get(I->getType()));
+    I->eraseFromParent();
+  }
 }
 
 bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
index 3def10e73717b..83cb92210ec84 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
@@ -423,5 +423,214 @@ bb.2:
   ret void
 }
 
+
+define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: repeat_successor:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_load_dword s2, s[0:1], 0x24
+; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    s_cmp_lt_i32 s2, 3
+; GFX906-NEXT:    s_cbranch_scc0 .LBB7_3
+; GFX906-NEXT:  ; %bb.1: ; %LeafBlock
+; GFX906-NEXT:    s_cmp_ge_i32 s2, 1
+; GFX906-NEXT:    s_cbranch_scc0 .LBB7_6
+; GFX906-NEXT:  ; %bb.2:
+; GFX906-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX906-NEXT:    global_load_dword v0, v0, s[4:5]
+; GFX906-NEXT:    s_branch .LBB7_5
+; GFX906-NEXT:  .LBB7_3: ; %LeafBlock5
+; GFX906-NEXT:    s_cmp_eq_u32 s2, 3
+; GFX906-NEXT:    s_cbranch_scc0 .LBB7_6
+; GFX906-NEXT:  ; %bb.4: ; %sw.bb5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX906-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX906-NEXT:  .LBB7_5: ; %return.sink.split
+; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
+; GFX906-NEXT:    v_mov_b32_e32 v1, 0
+; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX906-NEXT:  .LBB7_6: ; %return
+; GFX906-NEXT:    s_endpgm
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+  switch i32 %in, label %return [
+    i32 1, label %return.sink.split
+    i32 2, label %return.sink.split
+    i32 3, label %sw.bb5
+  ]
+
+sw.bb5:
+  br label %return.sink.split
+
+return.sink.split:
+  %tmp5 = phi <4 x i8> [ %vec2, %sw.bb5 ], [ %vec1, %entry ], [ %vec1, %entry ]
+  store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4
+  ret void
+
+return:
+  ret void
+}
+
+define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
+; GFX906-LABEL: v8i8_phi_chain:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
+; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[0:1]
+; GFX906-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX906-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX906-NEXT:    s_cbranch_execz .LBB8_2
+; GFX906-NEXT:  ; %bb.1: ; %bb.1
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[2:3]
+; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v0
+; GFX906-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX906-NEXT:    s_and_b64 s[2:3], exec, vcc
+; GFX906-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX906-NEXT:  .LBB8_2: ; %Flow
+; GFX906-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
+; GFX906-NEXT:    s_cbranch_execz .LBB8_4
+; GFX906-NEXT:  ; %bb.3: ; %bb.2
+; GFX906-NEXT:    v_mov_b32_e32 v0, 0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[4:5]
+; GFX906-NEXT:  .LBB8_4: ; %bb.3
+; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT:    v_mov_b32_e32 v0, 0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[6:7]
+; GFX906-NEXT:    s_endpgm
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
+  %cmp = icmp ult i32 %idx, 15
+  br i1 %cmp, label %bb.1, label %bb.2
+bb.1:
+  %cmp2 = icmp ult i32 %idx, 7
+  br i1 %cmp2, label %bb.2, label %bb.3
+
+bb.2:
+  %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+  store <8 x i8> %tmp5, ptr addrspace(1) %dst0, align 4
+  br label %bb.3
+
+bb.3:
+  %tmp7 = phi <8 x i8> [ %vec2, %bb.1], [%tmp5, %bb.2]
+  store <8 x i8> %tmp7, ptr addrspace(1) %dst1, align 4
+  ret void
+}
+
+define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
+; GFX906-LABEL: v8i8_multi_block:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
+; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    global_load_dwordx2 v[3:4], v5, s[0:1]
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_mov_b32_e32 v1, v3
+; GFX906-NEXT:    v_mov_b32_e32 v2, v4
+; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX906-NEXT:    s_cbranch_execz .LBB9_4
+; GFX906-NEXT:  ; %bb.1: ; %bb.1
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v5, s[2:3]
+; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v0
+; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX906-NEXT:    s_cbranch_execz .LBB9_3
+; GFX906-NEXT:  ; %bb.2: ; %bb.2
+; GFX906-NEXT:    v_mov_b32_e32 v0, 0
+; GFX906-NEXT:    global_store_dwordx2 v0, v[3:4], s[4:5]
+; GFX906-NEXT:  .LBB9_3: ; %Flow
+; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT:  .LBB9_4: ; %bb.3
+; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX906-NEXT:    v_mov_b32_e32 v0, 0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[6:7]
+; GFX906-NEXT:    s_endpgm
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
+  %cmp = icmp ult i32 %idx, 15
+  br i1 %cmp, label %bb.1, label %bb.3
+bb.1:
+  %cmp2 = icmp ult i32 %idx, 7
+  br i1 %cmp2, label %bb.2, label %bb.3
+
+bb.2:
+  store <8 x i8> %vec1, ptr addrspace(1) %dst0, align 4
+  br label %bb.3
+
+bb.3:
+  %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ], [ %vec2, %bb.2]
+  store <8 x i8> %tmp5, ptr addrspace(1) %dst1, align 4
+  ret void
+}
+
+define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: v32i8_loop_carried:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 5, v0
+; GFX906-NEXT:    v_mov_b32_e32 v3, 8
+; GFX906-NEXT:    v_mov_b32_e32 v2, 0xff
+; GFX906-NEXT:    v_cmp_le_u32_e32 vcc, 15, v0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    global_load_dword v1, v1, s[2:3]
+; GFX906-NEXT:    s_mov_b64 s[2:3], 0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX906-NEXT:    v_and_or_b32 v0, v1, v2, v0
+; GFX906-NEXT:    v_mov_b32_e32 v2, 24
+; GFX906-NEXT:  .LBB10_1: ; %bb.1
+; GFX906-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX906-NEXT:    v_and_b32_e32 v3, 0xff, v1
+; GFX906-NEXT:    s_and_b64 s[4:5], exec, vcc
+; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX906-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
+; GFX906-NEXT:    v_or3_b32 v1, v0, v3, v1
+; GFX906-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX906-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX906-NEXT:  ; %bb.2: ; %bb.2.loopexit
+; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX906-NEXT:    v_mov_b32_e32 v0, 0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX906-NEXT:    s_endpgm
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  br label %bb.1
+
+bb.1:
+  %temp = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+  %vec2 = shufflevector <4 x i8> %vec1, <4 x i8> %temp, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %cmp = icmp ult i32 %idx, 15
+  br i1 %cmp, label %bb.1, label %bb.2
+
+bb.2:
+  store <4 x i8> %vec2, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+
 declare i32 @llvm.amdgcn.workitem.id.x()
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-loop-carried.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-loop-carried.ll
deleted file mode 100644
index ffc91815821a1..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-loop-carried.ll
+++ /dev/null
@@ -1,52 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
-
-define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: v32i8_liveout:
-; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 5, v0
-; GFX906-NEXT:    v_mov_b32_e32 v3, 8
-; GFX906-NEXT:    v_mov_b32_e32 v2, 0xff
-; GFX906-NEXT:    v_cmp_le_u32_e32 vcc, 15, v0
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dword v1, v1, s[2:3]
-; GFX906-NEXT:    s_mov_b64 s[2:3], 0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX906-NEXT:    v_and_or_b32 v0, v1, v2, v0
-; GFX906-NEXT:    v_mov_b32_e32 v2, 24
-; GFX906-NEXT:  .LBB0_1: ; %bb.1
-; GFX906-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX906-NEXT:    v_and_b32_e32 v3, 0xff, v1
-; GFX906-NEXT:    s_and_b64 s[4:5], exec, vcc
-; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX906-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
-; GFX906-NEXT:    v_or3_b32 v1, v0, v3, v1
-; GFX906-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX906-NEXT:    s_cbranch_execnz .LBB0_1
-; GFX906-NEXT:  ; %bb.2: ; %bb.2.loopexit
-; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX906-NEXT:    v_mov_b32_e32 v0, 0
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX906-NEXT:    s_endpgm
-entry:
-  %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx
-  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
-  br label %bb.1
-
-bb.1:
-  %temp = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
-  %vec2 = shufflevector <4 x i8> %vec1, <4 x i8> %temp, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-  %cmp = icmp ult i32 %idx, 15
-  br i1 %cmp, label %bb.1, label %bb.2
-  br label %bb.2
-
-bb.2:
-  store <4 x i8> %vec2, ptr addrspace(1) %dst, align 4
-  ret void
-}
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index d08523f4d3cd0..6ee4df95308b9 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -402,4 +402,154 @@ return:
   ret void
 }
 
+define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
+; GFX906-LABEL: v8i8_phi_chain:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
+; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[0:1]
+; GFX906-NEXT:    v_cmp_lt_u32_e64 s[0:1], 14, v0
+; GFX906-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX906-NEXT:    s_cbranch_execz .LBB8_2
+; GFX906-NEXT:  ; %bb.1: ; %bb.1
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[2:3]
+; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v0
+; GFX906-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX906-NEXT:    s_and_b64 s[2:3], vcc, exec
+; GFX906-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX906-NEXT:  .LBB8_2: ; %Flow
+; GFX906-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
+; GFX906-NEXT:    s_cbranch_execz .LBB8_4
+; GFX906-NEXT:  ; %bb.3: ; %bb.2
+; GFX906-NEXT:    v_mov_b32_e32 v0, 0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[4:5]
+; GFX906-NEXT:  .LBB8_4: ; %bb.3
+; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT:    v_mov_b32_e32 v0, 0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[6:7]
+; GFX906-NEXT:    s_endpgm
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
+  %cmp = icmp ult i32 %idx, 15
+  br i1 %cmp, label %bb.1, label %bb.2
+bb.1:
+  %cmp2 = icmp ult i32 %idx, 7
+  br i1 %cmp2, label %bb.2, label %bb.3
+
+bb.2:
+  %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+  store <8 x i8> %tmp5, ptr addrspace(1) %dst0, align 4
+  br label %bb.3
+
+bb.3:
+  %tmp7 = phi <8 x i8> [ %vec2, %bb.1], [%tmp5, %bb.2]
+  store <8 x i8> %tmp7, ptr addrspace(1) %dst1, align 4
+  ret void
+}
+
+define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
+; GFX906-LABEL: v8i8_multi_block:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
+; GFX906-NEXT:    v_mov_b32_e32 v5, 0
+; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    global_load_dwordx2 v[3:4], v6, s[0:1]
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_mov_b32_e32 v1, v3
+; GFX906-NEXT:    v_mov_b32_e32 v2, v4
+; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX906-NEXT:    s_cbranch_execz .LBB9_4
+; GFX906-NEXT:  ; %bb.1: ; %bb.1
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v6, s[2:3]
+; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v0
+; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX906-NEXT:    s_cbranch_execz .LBB9_3
+; GFX906-NEXT:  ; %bb.2: ; %bb.2
+; GFX906-NEXT:    v_mov_b32_e32 v0, 0
+; GFX906-NEXT:    global_store_dwordx2 v0, v[3:4], s[4:5]
+; GFX906-NEXT:  .LBB9_3: ; %Flow
+; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT:  .LBB9_4: ; %bb.3
+; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    global_store_dwordx2 v5, v[1:2], s[6:7]
+; GFX906-NEXT:    s_endpgm
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
+  %cmp = icmp ult i32 %idx, 15
+  br i1 %cmp, label %bb.1, label %bb.3
+bb.1:
+  %cmp2 = icmp ult i32 %idx, 7
+  br i1 %cmp2, label %bb.2, label %bb.3
+
+bb.2:
+  store <8 x i8> %vec1, ptr addrspace(1) %dst0, align 4
+  br label %bb.3
+
+bb.3:
+  %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ], [ %vec2, %bb.2]
+  store <8 x i8> %tmp5, ptr addrspace(1) %dst1, align 4
+  ret void
+}
+
+define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: v32i8_loop_carried:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 5, v0
+; GFX906-NEXT:    v_cmp_lt_u32_e32 vcc, 14, v0
+; GFX906-NEXT:    s_mov_b32 s4, 0x2000604
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    global_load_dword v1, v1, s[2:3]
+; GFX906-NEXT:    s_mov_b64 s[2:3], 0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_mov_b32_e32 v0, v1
+; GFX906-NEXT:  .LBB10_1: ; %bb.1
+; GFX906-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX906-NEXT:    s_and_b64 s[6:7], exec, vcc
+; GFX906-NEXT:    s_or_b64 s[2:3], s[6:7], s[2:3]
+; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX906-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX906-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX906-NEXT:  ; %bb.2: ; %bb.2.loopexit
+; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX906-NEXT:    v_mov_b32_e32 v1, 0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX906-NEXT:    s_endpgm
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  br label %bb.1
+
+bb.1:
+  %temp = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+  %vec2 = shufflevector <4 x i8> %vec1, <4 x i8> %temp, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %cmp = icmp ult i32 %idx, 15
+  br i1 %cmp, label %bb.1, label %bb.2
+
+bb.2:
+  store <4 x i8> %vec2, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+
 declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-loop-carried.ll b/llvm/test/CodeGen/AMDGPU/vni8-loop-carried.ll
deleted file mode 100644
index 1020990edecac..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/vni8-loop-carried.ll
+++ /dev/null
@@ -1,46 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
-
-define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: v32i8_liveout:
-; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 5, v0
-; GFX906-NEXT:    v_cmp_lt_u32_e32 vcc, 14, v0
-; GFX906-NEXT:    s_mov_b32 s4, 0x2000604
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dword v1, v1, s[2:3]
-; GFX906-NEXT:    s_mov_b64 s[2:3], 0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_mov_b32_e32 v0, v1
-; GFX906-NEXT:  .LBB0_1: ; %bb.1
-; GFX906-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX906-NEXT:    s_and_b64 s[6:7], exec, vcc
-; GFX906-NEXT:    s_or_b64 s[2:3], s[6:7], s[2:3]
-; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
-; GFX906-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX906-NEXT:    s_cbranch_execnz .LBB0_1
-; GFX906-NEXT:  ; %bb.2: ; %bb.2.loopexit
-; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX906-NEXT:    v_mov_b32_e32 v1, 0
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_store_dword v1, v0, s[0:1]
-; GFX906-NEXT:    s_endpgm
-entry:
-  %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx
-  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
-  br label %bb.1
-
-bb.1:
-  %temp = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
-  %vec2 = shufflevector <4 x i8> %vec1, <4 x i8> %temp, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-  %cmp = icmp ult i32 %idx, 15
-  br i1 %cmp, label %bb.1, label %bb.2
-  br label %bb.2
-
-bb.2:
-  store <4 x i8> %vec2, ptr addrspace(1) %dst, align 4
-  ret void
-}

>From 7a21f74bbc7f0c8af1c584f26b984d1221bbc235 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 12 Jun 2024 14:02:43 -0700
Subject: [PATCH 13/17] Review comments

Change-Id: I1ec3c042ed404285b14f57e1758bbce7fb1dd7c4
---
 .../AMDGPU/AMDGPULateCodeGenPrepare.cpp       |  78 +--
 .../CodeGen/AMDGPU/extract-subvector-16bit.ll | 359 +++++++-------
 llvm/test/CodeGen/AMDGPU/extract-subvector.ll | 466 ++++++++++++++++++
 3 files changed, 680 insertions(+), 223 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 78bff7ffcacd6..4384dbb835ca2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -87,6 +87,7 @@ class LiveRegOptimizer {
 private:
   Module *Mod = nullptr;
   const DataLayout *DL = nullptr;
+  const GCNSubtarget *ST;
   /// The scalar type to convert to
   Type *ConvertToScalar;
   /// The set of visited Instructions
@@ -117,7 +118,29 @@ class LiveRegOptimizer {
   /// Remove all instructions that have become dead (i.e. all the re-typed PHIs)
   void removeDeadInstrs();
 
-  LiveRegOptimizer(Module *Mod) : Mod(Mod) {
+  // Whether or not the type should be replaced to avoid inefficient
+  // legalization code
+  bool shouldReplace(Type *ITy) {
+    FixedVectorType *VTy = dyn_cast<FixedVectorType>(ITy);
+    if (!VTy)
+      return false;
+
+    auto TLI = ST->getTargetLowering();
+
+    Type *EltTy = VTy->getElementType();
+    // If the element size is not less than the convert to scalar size, then we
+    // can't do any bit packing
+    if (!EltTy->isIntegerTy() || EltTy->getScalarSizeInBits() >
+                                     ConvertToScalar->getScalarSizeInBits())
+      return false;
+
+    // Only coerce illegal types
+    TargetLoweringBase::LegalizeKind LK =
+        TLI->getTypeConversion(EltTy->getContext(), EVT::getEVT(EltTy, false));
+    return LK.first != TargetLoweringBase::TypeLegal;
+  }
+
+  LiveRegOptimizer(Module *Mod, const GCNSubtarget *ST) : Mod(Mod), ST(ST) {
     DL = &Mod->getDataLayout();
     ConvertToScalar = Type::getInt32Ty(Mod->getContext());
   }
@@ -151,7 +174,7 @@ bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
   // vectors to equivalent vectors of legal type (which are converted back
   // before uses in subsequent blocks), to pack the bits into fewer physical
   // registers (used in CopyToReg/CopyFromReg pairs).
-  LiveRegOptimizer LRO(Mod);
+  LiveRegOptimizer LRO(Mod, &ST);
 
   bool Changed = false;
 
@@ -169,9 +192,7 @@ Type *LiveRegOptimizer::calculateConvertType(Type *OriginalType) {
   assert(OriginalType->getScalarSizeInBits() <=
          ConvertToScalar->getScalarSizeInBits());
 
-  FixedVectorType *VTy = dyn_cast<FixedVectorType>(OriginalType);
-  if (!VTy)
-    return nullptr;
+  FixedVectorType *VTy = cast<FixedVectorType>(OriginalType);
 
   TypeSize OriginalSize = DL->getTypeSizeInBits(VTy);
   TypeSize ConvertScalarSize = DL->getTypeSizeInBits(ConvertToScalar);
@@ -182,15 +203,13 @@ Type *LiveRegOptimizer::calculateConvertType(Type *OriginalType) {
     return IntegerType::get(Mod->getContext(), ConvertScalarSize);
 
   return VectorType::get(Type::getIntNTy(Mod->getContext(), ConvertScalarSize),
-                         ElementCount::getFixed(ConvertEltCount));
+                         ConvertEltCount, false);
 }
 
 Value *LiveRegOptimizer::convertToOptType(Instruction *V,
                                           BasicBlock::iterator &InsertPt) {
-  VectorType *VTy = cast<VectorType>(V->getType());
+  FixedVectorType *VTy = cast<FixedVectorType>(V->getType());
   Type *NewTy = calculateConvertType(V->getType());
-  if (!NewTy)
-    return nullptr;
 
   TypeSize OriginalSize = DL->getTypeSizeInBits(VTy);
   TypeSize NewSize = DL->getTypeSizeInBits(NewTy);
@@ -204,15 +223,14 @@ Value *LiveRegOptimizer::convertToOptType(Instruction *V,
 
   // If there is a bitsize mismatch, we must use a wider vector.
   assert(NewSize > OriginalSize);
-  ElementCount ExpandedVecElementCount =
-      ElementCount::getFixed(NewSize / VTy->getScalarSizeInBits());
+  uint64_t ExpandedVecElementCount = NewSize / VTy->getScalarSizeInBits();
 
   SmallVector<int, 8> ShuffleMask;
   for (unsigned I = 0; I < VTy->getElementCount().getFixedValue(); I++)
     ShuffleMask.push_back(I);
 
   for (uint64_t I = VTy->getElementCount().getFixedValue();
-       I < ExpandedVecElementCount.getFixedValue(); I++)
+       I < ExpandedVecElementCount; I++)
     ShuffleMask.push_back(VTy->getElementCount().getFixedValue());
 
   Instruction *ExpandedVec =
@@ -224,17 +242,16 @@ Value *LiveRegOptimizer::convertToOptType(Instruction *V,
 Value *LiveRegOptimizer::convertFromOptType(Type *ConvertType, Instruction *V,
                                             BasicBlock::iterator &InsertPt,
                                             BasicBlock *InsertBB) {
-  VectorType *NewVTy = cast<VectorType>(ConvertType);
+  FixedVectorType *NewVTy = cast<FixedVectorType>(ConvertType);
 
   TypeSize OriginalSize = DL->getTypeSizeInBits(V->getType());
   TypeSize NewSize = DL->getTypeSizeInBits(NewVTy);
 
   IRBuilder<> Builder(InsertBB, InsertPt);
   // If there is a bitsize match, we simply convert back to the original type.
-  if (OriginalSize == NewSize) {
+  if (OriginalSize == NewSize)
     return cast<Instruction>(
         Builder.CreateBitCast(V, NewVTy, V->getName() + ".bc"));
-  }
 
   // If there is a bitsize mismatch, then we must have used a wider value to
   // hold the bits.
@@ -248,11 +265,9 @@ Value *LiveRegOptimizer::convertFromOptType(Type *ConvertType, Instruction *V,
 
   // For wider vectors, we must strip the MSBs to convert back to the original
   // type.
-  ElementCount ExpandedVecElementCount =
-      ElementCount::getFixed(OriginalSize / NewVTy->getScalarSizeInBits());
   VectorType *ExpandedVT = VectorType::get(
       Type::getIntNTy(Mod->getContext(), NewVTy->getScalarSizeInBits()),
-      ExpandedVecElementCount);
+      (OriginalSize / NewVTy->getScalarSizeInBits()), false);
   Instruction *Converted =
       cast<Instruction>(Builder.CreateBitCast(V, ExpandedVT));
 
@@ -272,31 +287,25 @@ bool LiveRegOptimizer::optimizeLiveType(Instruction *I) {
   Worklist.push_back(cast<Instruction>(I));
   while (!Worklist.empty()) {
     Instruction *II = Worklist.pop_back_val();
-    if (Visited.count(II))
-      continue;
-    Visited.insert(II);
 
-    Type *ITy = II->getType();
-    // Only vectors of illegal type will be scalarized when building the
-    // selection DAG.
-    bool ShouldReplace = ITy->isVectorTy() && ITy->getScalarSizeInBits() < 16 &&
-                         !ITy->getScalarType()->isPointerTy();
+    if (!Visited.insert(II).second)
+      continue;
 
-    if (!ShouldReplace)
+    if (!shouldReplace(II->getType()))
       continue;
 
-    if (auto *Phi = dyn_cast<PHINode>(II)) {
+    if (PHINode *Phi = dyn_cast<PHINode>(II)) {
       PhiNodes.insert(Phi);
       // Collect all the incoming values of problematic PHI nodes.
       for (Value *V : Phi->incoming_values()) {
         // Repeat the collection process for newly found PHI nodes.
-        if (auto *OpPhi = dyn_cast<PHINode>(V)) {
+        if (PHINode *OpPhi = dyn_cast<PHINode>(V)) {
           if (!PhiNodes.count(OpPhi) && !Visited.count(OpPhi))
             Worklist.push_back(OpPhi);
           continue;
         }
 
-        auto IncInst = dyn_cast<Instruction>(V);
+        Instruction *IncInst = dyn_cast<Instruction>(V);
         if (!IncInst)
           return false;
 
@@ -308,13 +317,13 @@ bool LiveRegOptimizer::optimizeLiveType(Instruction *I) {
     // Collect all relevant uses.
     for (User *V : II->users()) {
       // Repeat the collection process for problematic PHI nodes.
-      if (auto *OpPhi = dyn_cast<PHINode>(V)) {
+      if (PHINode *OpPhi = dyn_cast<PHINode>(V)) {
         if (!PhiNodes.count(OpPhi) && !Visited.count(OpPhi))
           Worklist.push_back(OpPhi);
         continue;
       }
 
-      auto UseInst = cast<Instruction>(V);
+      Instruction *UseInst = cast<Instruction>(V);
       // Collect all uses of PHINodes and any use the crosses BB boundaries.
       if (UseInst->getParent() != II->getParent() || isa<PHINode>(II)) {
         Uses.insert(UseInst);
@@ -336,12 +345,11 @@ bool LiveRegOptimizer::optimizeLiveType(Instruction *I) {
   }
 
   // Construct new-typed PHI nodes.
-  for (PHINode *Phi : PhiNodes) {
-    assert(calculateConvertType(Phi->getType()));
+  for (PHINode *Phi : PhiNodes)
     ValMap[Phi] = PHINode::Create(calculateConvertType(Phi->getType()),
                                   Phi->getNumIncomingValues(),
                                   Phi->getName() + ".tc", Phi->getIterator());
-  }
+
   // Connect all the PHI nodes with their new incoming values.
   for (PHINode *Phi : PhiNodes) {
     PHINode *NewPhi = cast<PHINode>(ValMap[Phi]);
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
index 6dabd8c0b83ea..efbbe2b27f10f 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -13,9 +13,9 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 glc
+; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc
+; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -30,27 +30,25 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
 ; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; SI-NEXT:    v_or_b32_e32 v2, v6, v2
-; SI-NEXT:    v_or_b32_e32 v3, v5, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_or_b32_e32 v3, v6, v2
+; SI-NEXT:    v_or_b32_e32 v2, v4, v5
 ; SI-NEXT:    s_mov_b64 vcc, exec
 ; SI-NEXT:    s_cbranch_execz .LBB0_3
 ; SI-NEXT:    s_branch .LBB0_4
 ; SI-NEXT:  .LBB0_2:
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
 ; SI-NEXT:    s_mov_b64 vcc, 0
 ; SI-NEXT:  .LBB0_3: ; %T
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:2 glc
+; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -63,29 +61,29 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
 ; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
-; SI-NEXT:    v_or_b32_e32 v2, v2, v0
-; SI-NEXT:    v_or_b32_e32 v3, v3, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; SI-NEXT:    v_or_b32_e32 v3, v4, v0
+; SI-NEXT:    v_or_b32_e32 v2, v2, v1
 ; SI-NEXT:  .LBB0_4: ; %exit
-; SI-NEXT:    v_bfe_i32 v0, v3, 0, 16
-; SI-NEXT:    v_bfe_i32 v1, v4, 0, 16
-; SI-NEXT:    v_bfe_i32 v2, v2, 0, 16
-; SI-NEXT:    v_mov_b32_e32 v3, 0xffff
-; SI-NEXT:    v_mov_b32_e32 v4, 0x8000
-; SI-NEXT:    v_mov_b32_e32 v5, 0xffff0000
-; SI-NEXT:    v_bfrev_b32_e32 v6, 1
+; SI-NEXT:    v_ashrrev_i32_e32 v0, 16, v2
+; SI-NEXT:    v_bfe_i32 v1, v2, 0, 16
+; SI-NEXT:    v_bfe_i32 v2, v3, 0, 16
+; SI-NEXT:    v_mov_b32_e32 v3, 0xffff0000
+; SI-NEXT:    v_bfrev_b32_e32 v4, 1
+; SI-NEXT:    v_mov_b32_e32 v5, 0xffff
+; SI-NEXT:    v_mov_b32_e32 v6, 0x8000
 ; SI-NEXT:    v_mov_b32_e32 v7, 0xffff8000
 ; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
-; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v4, v3, v4, vcc
 ; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
-; SI-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v6, vcc
 ; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
-; SI-NEXT:    v_cndmask_b32_e32 v2, -1, v7, vcc
-; SI-NEXT:    v_or_b32_e32 v0, v0, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v2
-; SI-NEXT:    v_or_b32_e32 v2, v3, v4
-; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT:    v_cndmask_b32_e32 v1, -1, v7, vcc
+; SI-NEXT:    v_or_b32_e32 v0, v0, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v1
+; SI-NEXT:    v_or_b32_e32 v2, v3, v2
+; SI-NEXT:    v_alignbit_b32 v1, v2, v4, 16
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: vec_8xi16_extract_4xi16:
@@ -180,26 +178,23 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:10 glc
+; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
-; SI-NEXT:    v_or_b32_e32 v3, v6, v3
-; SI-NEXT:    v_or_b32_e32 v5, v5, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
+; SI-NEXT:    v_or_b32_e32 v5, v6, v2
+; SI-NEXT:    v_or_b32_e32 v4, v4, v3
 ; SI-NEXT:    s_mov_b64 vcc, exec
 ; SI-NEXT:    s_cbranch_execz .LBB1_3
 ; SI-NEXT:    s_branch .LBB1_4
 ; SI-NEXT:  .LBB1_2:
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
 ; SI-NEXT:    s_mov_b64 vcc, 0
 ; SI-NEXT:  .LBB1_3: ; %T
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -214,39 +209,39 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:10 glc
+; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:10 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:12 glc
+; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:14 glc
+; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
-; SI-NEXT:    v_or_b32_e32 v3, v3, v0
-; SI-NEXT:    v_or_b32_e32 v5, v5, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; SI-NEXT:    v_or_b32_e32 v5, v4, v0
+; SI-NEXT:    v_or_b32_e32 v4, v2, v1
 ; SI-NEXT:  .LBB1_4: ; %exit
-; SI-NEXT:    v_bfe_i32 v0, v5, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v2, 16, v4
+; SI-NEXT:    v_ashr_i64 v[0:1], v[4:5], 48
 ; SI-NEXT:    v_bfe_i32 v1, v4, 0, 16
-; SI-NEXT:    v_bfe_i32 v3, v3, 0, 16
-; SI-NEXT:    v_bfe_i32 v2, v2, 0, 16
-; SI-NEXT:    v_mov_b32_e32 v4, 0xffff
-; SI-NEXT:    v_mov_b32_e32 v5, 0x8000
-; SI-NEXT:    v_mov_b32_e32 v6, 0xffff0000
-; SI-NEXT:    v_bfrev_b32_e32 v7, 1
-; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
-; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
+; SI-NEXT:    v_bfe_i32 v3, v5, 0, 16
+; SI-NEXT:    v_mov_b32_e32 v4, 0xffff0000
+; SI-NEXT:    v_bfrev_b32_e32 v5, 1
+; SI-NEXT:    v_mov_b32_e32 v6, 0xffff
+; SI-NEXT:    v_mov_b32_e32 v7, 0x8000
+; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
+; SI-NEXT:    v_cndmask_b32_e32 v8, v4, v5, vcc
 ; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
 ; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v3
+; SI-NEXT:    v_cndmask_b32_e32 v2, v6, v7, vcc
+; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
 ; SI-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
-; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
-; SI-NEXT:    v_cndmask_b32_e32 v4, v6, v7, vcc
-; SI-NEXT:    v_or_b32_e32 v0, v0, v1
-; SI-NEXT:    v_or_b32_e32 v2, v3, v4
-; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
+; SI-NEXT:    v_or_b32_e32 v0, v1, v8
+; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_alignbit_b32 v1, v2, v8, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: vec_8xi16_extract_4xi16_2:
@@ -499,9 +494,9 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 glc
+; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc
+; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -532,27 +527,25 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
 ; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; SI-NEXT:    v_or_b32_e32 v2, v6, v2
-; SI-NEXT:    v_or_b32_e32 v3, v5, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_or_b32_e32 v3, v6, v2
+; SI-NEXT:    v_or_b32_e32 v2, v4, v5
 ; SI-NEXT:    s_mov_b64 vcc, exec
 ; SI-NEXT:    s_cbranch_execz .LBB3_3
 ; SI-NEXT:    s_branch .LBB3_4
 ; SI-NEXT:  .LBB3_2:
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
 ; SI-NEXT:    s_mov_b64 vcc, 0
 ; SI-NEXT:  .LBB3_3: ; %T
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:2 glc
+; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -581,29 +574,29 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
 ; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
-; SI-NEXT:    v_or_b32_e32 v2, v2, v0
-; SI-NEXT:    v_or_b32_e32 v3, v3, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; SI-NEXT:    v_or_b32_e32 v3, v4, v0
+; SI-NEXT:    v_or_b32_e32 v2, v2, v1
 ; SI-NEXT:  .LBB3_4: ; %exit
-; SI-NEXT:    v_bfe_i32 v0, v3, 0, 16
-; SI-NEXT:    v_bfe_i32 v1, v4, 0, 16
-; SI-NEXT:    v_bfe_i32 v2, v2, 0, 16
-; SI-NEXT:    v_mov_b32_e32 v3, 0xffff
-; SI-NEXT:    v_mov_b32_e32 v4, 0x8000
-; SI-NEXT:    v_mov_b32_e32 v5, 0xffff0000
-; SI-NEXT:    v_bfrev_b32_e32 v6, 1
+; SI-NEXT:    v_ashrrev_i32_e32 v0, 16, v2
+; SI-NEXT:    v_bfe_i32 v1, v2, 0, 16
+; SI-NEXT:    v_bfe_i32 v2, v3, 0, 16
+; SI-NEXT:    v_mov_b32_e32 v3, 0xffff0000
+; SI-NEXT:    v_bfrev_b32_e32 v4, 1
+; SI-NEXT:    v_mov_b32_e32 v5, 0xffff
+; SI-NEXT:    v_mov_b32_e32 v6, 0x8000
 ; SI-NEXT:    v_mov_b32_e32 v7, 0xffff8000
 ; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
-; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v4, v3, v4, vcc
 ; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
-; SI-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v6, vcc
 ; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
-; SI-NEXT:    v_cndmask_b32_e32 v2, -1, v7, vcc
-; SI-NEXT:    v_or_b32_e32 v0, v0, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v2
-; SI-NEXT:    v_or_b32_e32 v2, v3, v4
-; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT:    v_cndmask_b32_e32 v1, -1, v7, vcc
+; SI-NEXT:    v_or_b32_e32 v0, v0, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v1
+; SI-NEXT:    v_or_b32_e32 v2, v3, v2
+; SI-NEXT:    v_alignbit_b32 v1, v2, v4, 16
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: vec_16xi16_extract_4xi16:
@@ -710,13 +703,13 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:10 glc
+; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:12 glc
+; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:14 glc
+; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:14 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -734,18 +727,15 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; SI-NEXT:    v_or_b32_e32 v2, v7, v2
-; SI-NEXT:    v_or_b32_e32 v3, v6, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
+; SI-NEXT:    v_or_b32_e32 v5, v6, v2
+; SI-NEXT:    v_or_b32_e32 v4, v4, v3
 ; SI-NEXT:    s_mov_b64 vcc, exec
 ; SI-NEXT:    s_cbranch_execz .LBB4_3
 ; SI-NEXT:    s_branch .LBB4_4
 ; SI-NEXT:  .LBB4_2:
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
 ; SI-NEXT:    s_mov_b64 vcc, 0
 ; SI-NEXT:  .LBB4_3: ; %T
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -760,11 +750,11 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:10 glc
+; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:10 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:12 glc
+; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:14 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -785,29 +775,29 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
 ; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
-; SI-NEXT:    v_or_b32_e32 v2, v2, v0
-; SI-NEXT:    v_or_b32_e32 v3, v3, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; SI-NEXT:    v_or_b32_e32 v5, v4, v0
+; SI-NEXT:    v_or_b32_e32 v4, v2, v1
 ; SI-NEXT:  .LBB4_4: ; %exit
-; SI-NEXT:    v_bfe_i32 v0, v3, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v2, 16, v4
+; SI-NEXT:    v_ashr_i64 v[0:1], v[4:5], 48
 ; SI-NEXT:    v_bfe_i32 v1, v4, 0, 16
-; SI-NEXT:    v_bfe_i32 v2, v2, 0, 16
 ; SI-NEXT:    v_bfe_i32 v3, v5, 0, 16
-; SI-NEXT:    v_mov_b32_e32 v4, 0xffff
-; SI-NEXT:    v_mov_b32_e32 v5, 0x8000
-; SI-NEXT:    v_mov_b32_e32 v6, 0xffff0000
-; SI-NEXT:    v_bfrev_b32_e32 v7, 1
-; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
-; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
+; SI-NEXT:    v_mov_b32_e32 v4, 0xffff0000
+; SI-NEXT:    v_bfrev_b32_e32 v5, 1
+; SI-NEXT:    v_mov_b32_e32 v6, 0xffff
+; SI-NEXT:    v_mov_b32_e32 v7, 0x8000
+; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
+; SI-NEXT:    v_cndmask_b32_e32 v8, v4, v5, vcc
 ; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
-; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
-; SI-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
 ; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v3
-; SI-NEXT:    v_cndmask_b32_e32 v3, v6, v7, vcc
-; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    v_cndmask_b32_e32 v2, v6, v7, vcc
+; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
+; SI-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; SI-NEXT:    v_or_b32_e32 v0, v1, v8
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT:    v_alignbit_b32 v1, v2, v8, 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1205,21 +1195,21 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr
 ; SI-NEXT:    s_mov_b32 s39, 0xf000
 ; SI-NEXT:    s_mov_b32 s36, s38
 ; SI-NEXT:    s_mov_b32 s37, s38
-; SI-NEXT:    buffer_load_ushort v9, v[2:3], s[36:39], 0 addr64 glc
+; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[36:39], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[36:39], 0 addr64 offset:2 glc
+; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[36:39], 0 addr64 offset:2 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v10, v[2:3], s[36:39], 0 addr64 offset:4 glc
+; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[36:39], 0 addr64 offset:4 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[36:39], 0 addr64 offset:6 glc
+; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[36:39], 0 addr64 offset:6 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[36:39], 0 addr64 offset:8 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[36:39], 0 addr64 offset:10 glc
+; SI-NEXT:    buffer_load_ushort v9, v[2:3], s[36:39], 0 addr64 offset:10 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v11, v[2:3], s[36:39], 0 addr64 offset:12 glc
+; SI-NEXT:    buffer_load_ushort v10, v[2:3], s[36:39], 0 addr64 offset:12 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[36:39], 0 addr64 offset:14 glc
+; SI-NEXT:    buffer_load_ushort v11, v[2:3], s[36:39], 0 addr64 offset:14 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -1237,46 +1227,39 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[36:39], 0 addr64 offset:30 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
-; SI-NEXT:    v_or_b32_e32 v3, v11, v2
-; SI-NEXT:    v_or_b32_e32 v8, v8, v12
-; SI-NEXT:    v_or_b32_e32 v2, v10, v13
-; SI-NEXT:    v_or_b32_e32 v9, v9, v14
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v9
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v4
+; SI-NEXT:    v_or_b32_e32 v5, v10, v2
+; SI-NEXT:    v_or_b32_e32 v4, v8, v3
+; SI-NEXT:    v_or_b32_e32 v3, v7, v9
+; SI-NEXT:    v_or_b32_e32 v2, v6, v11
 ; SI-NEXT:    s_mov_b64 vcc, exec
 ; SI-NEXT:    s_cbranch_execz .LBB7_3
 ; SI-NEXT:    s_branch .LBB7_4
 ; SI-NEXT:  .LBB7_2:
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr7
+; SI-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
 ; SI-NEXT:    s_mov_b64 vcc, 0
 ; SI-NEXT:  .LBB7_3: ; %T
 ; SI-NEXT:    s_mov_b32 s39, 0xf000
 ; SI-NEXT:    s_mov_b32 s36, s38
 ; SI-NEXT:    s_mov_b32 s37, s38
-; SI-NEXT:    buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 glc
+; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[36:39], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[36:39], 0 addr64 offset:2 glc
+; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[36:39], 0 addr64 offset:2 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[36:39], 0 addr64 offset:4 glc
+; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[36:39], 0 addr64 offset:4 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[36:39], 0 addr64 offset:6 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v8, v[0:1], s[36:39], 0 addr64 offset:8 glc
+; SI-NEXT:    buffer_load_ushort v7, v[0:1], s[36:39], 0 addr64 offset:8 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[36:39], 0 addr64 offset:10 glc
+; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[36:39], 0 addr64 offset:10 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[36:39], 0 addr64 offset:12 glc
+; SI-NEXT:    buffer_load_ushort v8, v[0:1], s[36:39], 0 addr64 offset:12 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v7, v[0:1], s[36:39], 0 addr64 offset:14 glc
+; SI-NEXT:    buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:14 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -1294,52 +1277,52 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[36:39], 0 addr64 offset:30 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
-; SI-NEXT:    v_or_b32_e32 v3, v3, v0
-; SI-NEXT:    v_or_b32_e32 v8, v8, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v9
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
+; SI-NEXT:    v_or_b32_e32 v5, v8, v0
+; SI-NEXT:    v_or_b32_e32 v4, v7, v1
+; SI-NEXT:    v_or_b32_e32 v3, v6, v9
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v10
-; SI-NEXT:    v_or_b32_e32 v9, v9, v11
 ; SI-NEXT:  .LBB7_4: ; %exit
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v9
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v5
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v8
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
 ; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
+; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-NEXT:    s_movk_i32 s34, 0x3800
-; SI-NEXT:    v_mov_b32_e32 v8, 0x3d00
-; SI-NEXT:    v_mov_b32_e32 v9, 0x3900
-; SI-NEXT:    v_mov_b32_e32 v10, 0x3d000000
-; SI-NEXT:    v_mov_b32_e32 v11, 0x39000000
+; SI-NEXT:    v_mov_b32_e32 v8, 0x3d000000
+; SI-NEXT:    v_mov_b32_e32 v9, 0x39000000
+; SI-NEXT:    v_mov_b32_e32 v10, 0x3d00
+; SI-NEXT:    v_mov_b32_e32 v11, 0x3900
 ; SI-NEXT:    v_cmp_lt_u32_e32 vcc, s34, v0
-; SI-NEXT:    v_cndmask_b32_e32 v0, v8, v9, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v12, v8, v9, vcc
 ; SI-NEXT:    v_cmp_lt_u32_e32 vcc, s34, v1
+; SI-NEXT:    v_cndmask_b32_e32 v0, v10, v11, vcc
+; SI-NEXT:    v_cmp_lt_u32_e32 vcc, s34, v2
+; SI-NEXT:    v_cndmask_b32_e32 v13, v8, v9, vcc
+; SI-NEXT:    v_cmp_lt_u32_e32 vcc, s34, v4
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v10, v11, vcc
+; SI-NEXT:    v_cmp_lt_u32_e32 vcc, s34, v6
+; SI-NEXT:    v_cndmask_b32_e32 v14, v8, v9, vcc
 ; SI-NEXT:    v_cmp_lt_u32_e32 vcc, s34, v5
+; SI-NEXT:    v_cndmask_b32_e32 v2, v10, v11, vcc
+; SI-NEXT:    v_cmp_lt_u32_e32 vcc, s34, v7
 ; SI-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
-; SI-NEXT:    v_cmp_lt_u32_e32 vcc, s34, v6
-; SI-NEXT:    v_cndmask_b32_e32 v12, v10, v11, vcc
 ; SI-NEXT:    v_cmp_lt_u32_e32 vcc, s34, v3
-; SI-NEXT:    v_cndmask_b32_e32 v3, v8, v9, vcc
-; SI-NEXT:    v_cmp_lt_u32_e32 vcc, s34, v7
-; SI-NEXT:    v_cndmask_b32_e32 v7, v10, v11, vcc
-; SI-NEXT:    v_cmp_lt_u32_e32 vcc, s34, v2
-; SI-NEXT:    v_cndmask_b32_e32 v2, v8, v9, vcc
-; SI-NEXT:    v_cmp_lt_u32_e32 vcc, s34, v4
-; SI-NEXT:    v_cndmask_b32_e32 v8, v10, v11, vcc
-; SI-NEXT:    v_or_b32_e32 v0, v0, v1
-; SI-NEXT:    v_or_b32_e32 v4, v5, v12
-; SI-NEXT:    v_or_b32_e32 v6, v3, v7
-; SI-NEXT:    v_or_b32_e32 v2, v2, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v8
-; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v5, v6, v12, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT:    v_cndmask_b32_e32 v3, v10, v11, vcc
+; SI-NEXT:    v_or_b32_e32 v0, v0, v12
+; SI-NEXT:    v_or_b32_e32 v4, v1, v13
+; SI-NEXT:    v_or_b32_e32 v6, v2, v14
+; SI-NEXT:    v_or_b32_e32 v2, v3, v5
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
+; SI-NEXT:    v_alignbit_b32 v1, v2, v12, 16
+; SI-NEXT:    v_alignbit_b32 v5, v6, v13, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v14
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: vec_16xi16_extract_8xi16_0:
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll
index 15abf44f3a0ea..0a27767877497 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
 
 ; GCN-LABEL: extract_2xi16
@@ -21,6 +22,81 @@
 ; GCN: v_bfe_i32
 
 define <2 x i16> @extract_2xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
+; GCN-LABEL: extract_2xi16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v4, 1, v4
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GCN-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GCN-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
+; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
+; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
+; GCN-NEXT:    s_cbranch_execz .LBB0_2
+; GCN-NEXT:  ; %bb.1: ; %F
+; GCN-NEXT:    s_mov_b32 s10, 0
+; GCN-NEXT:    s_mov_b32 s11, 0xf000
+; GCN-NEXT:    s_mov_b32 s8, s10
+; GCN-NEXT:    s_mov_b32 s9, s10
+; GCN-NEXT:    buffer_load_ushort v0, v[2:3], s[8:11], 0 addr64 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_ushort v1, v[2:3], s[8:11], 0 addr64 offset:2 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:4 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:6 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:8 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:10 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:12 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_ushort v2, v[2:3], s[8:11], 0 addr64 offset:14 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_or_b32_e32 v4, v0, v1
+; GCN-NEXT:    ; implicit-def: $vgpr0
+; GCN-NEXT:  .LBB0_2: ; %Flow
+; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT:    s_cbranch_execz .LBB0_4
+; GCN-NEXT:  ; %bb.3: ; %T
+; GCN-NEXT:    s_mov_b32 s10, 0
+; GCN-NEXT:    s_mov_b32 s11, 0xf000
+; GCN-NEXT:    s_mov_b32 s8, s10
+; GCN-NEXT:    s_mov_b32 s9, s10
+; GCN-NEXT:    buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 offset:2 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:4 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:6 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:8 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:10 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:12 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 offset:14 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
+; GCN-NEXT:    v_or_b32_e32 v4, v2, v0
+; GCN-NEXT:  .LBB0_4: ; %exit
+; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT:    v_ashrrev_i32_e32 v0, 16, v4
+; GCN-NEXT:    v_bfe_i32 v1, v4, 0, 16
+; GCN-NEXT:    v_mov_b32_e32 v2, 0xffff
+; GCN-NEXT:    v_mov_b32_e32 v3, 0x8000
+; GCN-NEXT:    v_mov_b32_e32 v4, 0xffff8000
+; GCN-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; GCN-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v2, -1, v4, vcc
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   br i1 %c0, label %T, label %F
 
 T:
@@ -42,6 +118,58 @@ exit:
 ; GCN-LABEL: extract_2xi64
 ; GCN-COUNT-2: v_cndmask_b32
 define <2 x i64> @extract_2xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
+; GCN-LABEL: extract_2xi64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v4, 1, v4
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GCN-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GCN-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19
+; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
+; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
+; GCN-NEXT:    s_cbranch_execz .LBB1_2
+; GCN-NEXT:  ; %bb.1: ; %F
+; GCN-NEXT:    s_mov_b32 s10, 0
+; GCN-NEXT:    s_mov_b32 s11, 0xf000
+; GCN-NEXT:    s_mov_b32 s8, s10
+; GCN-NEXT:    s_mov_b32 s9, s10
+; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:32 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v[2:3], s[8:11], 0 addr64 offset:48 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; implicit-def: $vgpr0
+; GCN-NEXT:  .LBB1_2: ; %Flow
+; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT:    s_cbranch_execz .LBB1_4
+; GCN-NEXT:  ; %bb.3: ; %T
+; GCN-NEXT:    s_mov_b32 s10, 0
+; GCN-NEXT:    s_mov_b32 s11, 0xf000
+; GCN-NEXT:    s_mov_b32 s8, s10
+; GCN-NEXT:    s_mov_b32 s9, s10
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:32 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 offset:48 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:  .LBB1_4: ; %exit
+; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v1, 0xffff8000
+; GCN-NEXT:    v_cmp_lt_i64_e32 vcc, -1, v[4:5]
+; GCN-NEXT:    v_cndmask_b32_e32 v0, -1, v1, vcc
+; GCN-NEXT:    v_cmp_lt_i64_e32 vcc, -1, v[6:7]
+; GCN-NEXT:    v_cndmask_b32_e32 v2, -1, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v1, -1
+; GCN-NEXT:    v_mov_b32_e32 v3, -1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   br i1 %c0, label %T, label %F
 
 T:
@@ -63,6 +191,64 @@ exit:
 ; GCN-LABEL: extract_4xi64
 ; GCN-COUNT-4: v_cndmask_b32
 define <4 x i64> @extract_4xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
+; GCN-LABEL: extract_4xi64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v4, 1, v4
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GCN-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GCN-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19
+; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
+; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
+; GCN-NEXT:    s_cbranch_execz .LBB2_2
+; GCN-NEXT:  ; %bb.1: ; %F
+; GCN-NEXT:    s_mov_b32 s10, 0
+; GCN-NEXT:    s_mov_b32 s11, 0xf000
+; GCN-NEXT:    s_mov_b32 s8, s10
+; GCN-NEXT:    s_mov_b32 s9, s10
+; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[2:3], s[8:11], 0 addr64 offset:32 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v[2:3], s[8:11], 0 addr64 offset:48 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; implicit-def: $vgpr0
+; GCN-NEXT:  .LBB2_2: ; %Flow
+; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT:    s_cbranch_execz .LBB2_4
+; GCN-NEXT:  ; %bb.3: ; %T
+; GCN-NEXT:    s_mov_b32 s10, 0
+; GCN-NEXT:    s_mov_b32 s11, 0xf000
+; GCN-NEXT:    s_mov_b32 s8, s10
+; GCN-NEXT:    s_mov_b32 s9, s10
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:32 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 offset:48 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:  .LBB2_4: ; %exit
+; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v1, 0xffff8000
+; GCN-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc
+; GCN-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[6:7]
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v1, -1, vcc
+; GCN-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[8:9]
+; GCN-NEXT:    v_cndmask_b32_e64 v4, v1, -1, vcc
+; GCN-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[10:11]
+; GCN-NEXT:    v_cndmask_b32_e64 v6, v1, -1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v1, -1
+; GCN-NEXT:    v_mov_b32_e32 v3, -1
+; GCN-NEXT:    v_mov_b32_e32 v5, -1
+; GCN-NEXT:    v_mov_b32_e32 v7, -1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   br i1 %c0, label %T, label %F
 
 T:
@@ -84,6 +270,91 @@ exit:
 ; GCN-LABEL: extract_8xi64
 ; GCN-COUNT-8: v_cndmask_b32
 define <8 x i64> @extract_8xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
+; GCN-LABEL: extract_8xi64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v4, 1, v4
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GCN-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GCN-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35
+; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
+; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
+; GCN-NEXT:    s_cbranch_execz .LBB3_2
+; GCN-NEXT:  ; %bb.1: ; %F
+; GCN-NEXT:    s_mov_b32 s10, 0
+; GCN-NEXT:    s_mov_b32 s11, 0xf000
+; GCN-NEXT:    s_mov_b32 s8, s10
+; GCN-NEXT:    s_mov_b32 s9, s10
+; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:112 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:96 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:80 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:64 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[2:3], s[8:11], 0 addr64 offset:32 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[16:19], v[2:3], s[8:11], 0 addr64 offset:48 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; implicit-def: $vgpr0
+; GCN-NEXT:  .LBB3_2: ; %Flow
+; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT:    s_cbranch_execz .LBB3_4
+; GCN-NEXT:  ; %bb.3: ; %T
+; GCN-NEXT:    s_mov_b32 s10, 0
+; GCN-NEXT:    s_mov_b32 s11, 0xf000
+; GCN-NEXT:    s_mov_b32 s8, s10
+; GCN-NEXT:    s_mov_b32 s9, s10
+; GCN-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:112 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:96 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:80 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:64 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:32 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[8:11], 0 addr64 offset:48 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:  .LBB3_4: ; %exit
+; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT:    v_mov_b32_e32 v1, 0xffff8000
+; GCN-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[6:7]
+; GCN-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[8:9]
+; GCN-NEXT:    v_cmp_gt_i64_e64 s[6:7], 0, v[10:11]
+; GCN-NEXT:    v_cmp_gt_i64_e64 s[8:9], 0, v[12:13]
+; GCN-NEXT:    v_cmp_gt_i64_e64 s[10:11], 0, v[14:15]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_gt_i64_e64 s[12:13], 0, v[16:17]
+; GCN-NEXT:    v_cmp_gt_i64_e64 s[14:15], 0, v[18:19]
+; GCN-NEXT:    v_cmp_gt_i64_e64 s[16:17], 0, v[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, -1, s[16:17]
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v1, -1, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v4, v1, -1, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v6, v1, -1, s[6:7]
+; GCN-NEXT:    v_cndmask_b32_e64 v8, v1, -1, s[8:9]
+; GCN-NEXT:    v_cndmask_b32_e64 v10, v1, -1, s[10:11]
+; GCN-NEXT:    v_cndmask_b32_e64 v12, v1, -1, s[12:13]
+; GCN-NEXT:    v_cndmask_b32_e64 v14, v1, -1, s[14:15]
+; GCN-NEXT:    v_mov_b32_e32 v1, -1
+; GCN-NEXT:    v_mov_b32_e32 v3, -1
+; GCN-NEXT:    v_mov_b32_e32 v5, -1
+; GCN-NEXT:    v_mov_b32_e32 v7, -1
+; GCN-NEXT:    v_mov_b32_e32 v9, -1
+; GCN-NEXT:    v_mov_b32_e32 v11, -1
+; GCN-NEXT:    v_mov_b32_e32 v13, -1
+; GCN-NEXT:    v_mov_b32_e32 v15, -1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   br i1 %c0, label %T, label %F
 
 T:
@@ -105,6 +376,58 @@ exit:
 ; GCN-LABEL: extract_2xf64
 ; GCN-COUNT-2: v_cndmask_b32
 define <2 x double> @extract_2xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
+; GCN-LABEL: extract_2xf64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v4, 1, v4
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GCN-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GCN-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19
+; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
+; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
+; GCN-NEXT:    s_cbranch_execz .LBB4_2
+; GCN-NEXT:  ; %bb.1: ; %F
+; GCN-NEXT:    s_mov_b32 s10, 0
+; GCN-NEXT:    s_mov_b32 s11, 0xf000
+; GCN-NEXT:    s_mov_b32 s8, s10
+; GCN-NEXT:    s_mov_b32 s9, s10
+; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:32 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v[2:3], s[8:11], 0 addr64 offset:48 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; implicit-def: $vgpr0
+; GCN-NEXT:  .LBB4_2: ; %Flow
+; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT:    s_cbranch_execz .LBB4_4
+; GCN-NEXT:  ; %bb.3: ; %T
+; GCN-NEXT:    s_mov_b32 s10, 0
+; GCN-NEXT:    s_mov_b32 s11, 0xf000
+; GCN-NEXT:    s_mov_b32 s8, s10
+; GCN-NEXT:    s_mov_b32 s9, s10
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:32 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 offset:48 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:  .LBB4_4: ; %exit
+; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, 0xbff00000
+; GCN-NEXT:    v_cmp_lt_f64_e32 vcc, -1.0, v[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, -2.0, vcc
+; GCN-NEXT:    v_cmp_lt_f64_e32 vcc, -1.0, v[6:7]
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v0, -2.0, vcc
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   br i1 %c0, label %T, label %F
 
 T:
@@ -126,6 +449,64 @@ exit:
 ; GCN-LABEL: extract_4xf64
 ; GCN-COUNT-4: v_cndmask_b32
 define <4 x double> @extract_4xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
+; GCN-LABEL: extract_4xf64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v4, 1, v4
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GCN-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GCN-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19
+; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
+; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
+; GCN-NEXT:    s_cbranch_execz .LBB5_2
+; GCN-NEXT:  ; %bb.1: ; %F
+; GCN-NEXT:    s_mov_b32 s10, 0
+; GCN-NEXT:    s_mov_b32 s11, 0xf000
+; GCN-NEXT:    s_mov_b32 s8, s10
+; GCN-NEXT:    s_mov_b32 s9, s10
+; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[2:3], s[8:11], 0 addr64 offset:32 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v[2:3], s[8:11], 0 addr64 offset:48 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; implicit-def: $vgpr0
+; GCN-NEXT:  .LBB5_2: ; %Flow
+; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT:    s_cbranch_execz .LBB5_4
+; GCN-NEXT:  ; %bb.3: ; %T
+; GCN-NEXT:    s_mov_b32 s10, 0
+; GCN-NEXT:    s_mov_b32 s11, 0xf000
+; GCN-NEXT:    s_mov_b32 s8, s10
+; GCN-NEXT:    s_mov_b32 s9, s10
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:32 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 offset:48 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:  .LBB5_4: ; %exit
+; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, 0xbff00000
+; GCN-NEXT:    v_cmp_nlt_f64_e32 vcc, -1.0, v[4:5]
+; GCN-NEXT:    v_cndmask_b32_e32 v1, -2.0, v0, vcc
+; GCN-NEXT:    v_cmp_nlt_f64_e32 vcc, -1.0, v[6:7]
+; GCN-NEXT:    v_cndmask_b32_e32 v3, -2.0, v0, vcc
+; GCN-NEXT:    v_cmp_nlt_f64_e32 vcc, -1.0, v[8:9]
+; GCN-NEXT:    v_cndmask_b32_e32 v5, -2.0, v0, vcc
+; GCN-NEXT:    v_cmp_nlt_f64_e32 vcc, -1.0, v[10:11]
+; GCN-NEXT:    v_cndmask_b32_e32 v7, -2.0, v0, vcc
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    v_mov_b32_e32 v4, 0
+; GCN-NEXT:    v_mov_b32_e32 v6, 0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   br i1 %c0, label %T, label %F
 
 T:
@@ -147,6 +528,91 @@ exit:
 ; GCN-LABEL: extract_8xf64
 ; GCN-COUNT-8: v_cndmask_b32
 define <8 x double> @extract_8xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
+; GCN-LABEL: extract_8xf64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v4, 1, v4
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GCN-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GCN-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35
+; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
+; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
+; GCN-NEXT:    s_cbranch_execz .LBB6_2
+; GCN-NEXT:  ; %bb.1: ; %F
+; GCN-NEXT:    s_mov_b32 s10, 0
+; GCN-NEXT:    s_mov_b32 s11, 0xf000
+; GCN-NEXT:    s_mov_b32 s8, s10
+; GCN-NEXT:    s_mov_b32 s9, s10
+; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:112 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:96 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:80 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:64 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[2:3], s[8:11], 0 addr64 offset:32 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[16:19], v[2:3], s[8:11], 0 addr64 offset:48 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; implicit-def: $vgpr0
+; GCN-NEXT:  .LBB6_2: ; %Flow
+; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT:    s_cbranch_execz .LBB6_4
+; GCN-NEXT:  ; %bb.3: ; %T
+; GCN-NEXT:    s_mov_b32 s10, 0
+; GCN-NEXT:    s_mov_b32 s11, 0xf000
+; GCN-NEXT:    s_mov_b32 s8, s10
+; GCN-NEXT:    s_mov_b32 s9, s10
+; GCN-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:112 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:96 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:80 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:64 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:32 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[8:11], 0 addr64 offset:48 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:  .LBB6_4: ; %exit
+; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT:    v_mov_b32_e32 v0, 0xbff00000
+; GCN-NEXT:    v_cmp_nlt_f64_e32 vcc, -1.0, v[6:7]
+; GCN-NEXT:    v_cmp_nlt_f64_e64 s[4:5], -1.0, v[8:9]
+; GCN-NEXT:    v_cmp_nlt_f64_e64 s[6:7], -1.0, v[10:11]
+; GCN-NEXT:    v_cmp_nlt_f64_e64 s[8:9], -1.0, v[12:13]
+; GCN-NEXT:    v_cmp_nlt_f64_e64 s[10:11], -1.0, v[14:15]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_nlt_f64_e64 s[12:13], -1.0, v[16:17]
+; GCN-NEXT:    v_cmp_nlt_f64_e64 s[14:15], -1.0, v[18:19]
+; GCN-NEXT:    v_cmp_nlt_f64_e64 s[16:17], -1.0, v[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, -2.0, v0, s[16:17]
+; GCN-NEXT:    v_cndmask_b32_e32 v3, -2.0, v0, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v5, -2.0, v0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v7, -2.0, v0, s[6:7]
+; GCN-NEXT:    v_cndmask_b32_e64 v9, -2.0, v0, s[8:9]
+; GCN-NEXT:    v_cndmask_b32_e64 v11, -2.0, v0, s[10:11]
+; GCN-NEXT:    v_cndmask_b32_e64 v13, -2.0, v0, s[12:13]
+; GCN-NEXT:    v_cndmask_b32_e64 v15, -2.0, v0, s[14:15]
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    v_mov_b32_e32 v4, 0
+; GCN-NEXT:    v_mov_b32_e32 v6, 0
+; GCN-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-NEXT:    v_mov_b32_e32 v10, 0
+; GCN-NEXT:    v_mov_b32_e32 v12, 0
+; GCN-NEXT:    v_mov_b32_e32 v14, 0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   br i1 %c0, label %T, label %F
 
 T:

>From 487f1aff3543051bc0f70fcd958ad3f55e358837 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Mon, 17 Jun 2024 19:07:14 -0700
Subject: [PATCH 14/17] Fix issue

---
 llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 4384dbb835ca2..2e95f2fe2710e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -365,7 +365,7 @@ bool LiveRegOptimizer::optimizeLiveType(Instruction *I) {
       if (ValMap.contains(Op)) {
         Value *NewVal = nullptr;
         if (ValMap.contains(ValMap[Op]))
-          NewVal = ValMap[Op];
+          NewVal = ValMap[ValMap[Op]];
         else {
           BasicBlock::iterator InsertPt = U->getIterator();
           NewVal =

>From db399471dfdd7b8dc534eeaac2ff80dc070d02d4 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 18 Jun 2024 08:56:30 -0700
Subject: [PATCH 15/17] Handle zero init

Change-Id: I91f4601e201ddd48cae32a59ea77376db69080ce
---
 .../AMDGPU/AMDGPULateCodeGenPrepare.cpp       |  29 ++--
 ...dagcomb-extract-vec-elt-different-sizes.ll |  39 ++---
 llvm/test/CodeGen/AMDGPU/extract-subvector.ll |  32 ----
 .../test/CodeGen/AMDGPU/vni8-across-blocks.ll | 148 +++++++++++++++++-
 4 files changed, 182 insertions(+), 66 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 2e95f2fe2710e..8693d733ad6a7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -306,11 +306,13 @@ bool LiveRegOptimizer::optimizeLiveType(Instruction *I) {
         }
 
         Instruction *IncInst = dyn_cast<Instruction>(V);
-        if (!IncInst)
+        // Other incoming value types (e.g. vector literals) are unhandled
+        if (!IncInst && !isa<ConstantAggregateZero>(V))
           return false;
 
         // Collect all other incoming values for coercion.
-        Defs.insert(IncInst);
+        if (IncInst)
+          Defs.insert(IncInst);
       }
     }
 
@@ -353,9 +355,22 @@ bool LiveRegOptimizer::optimizeLiveType(Instruction *I) {
   // Connect all the PHI nodes with their new incoming values.
   for (PHINode *Phi : PhiNodes) {
     PHINode *NewPhi = cast<PHINode>(ValMap[Phi]);
-    for (int I = 0, E = Phi->getNumIncomingValues(); I < E; I++)
-      NewPhi->addIncoming(ValMap[Phi->getIncomingValue(I)],
-                          Phi->getIncomingBlock(I));
+    bool MissingIncVal = false;
+    for (int I = 0, E = Phi->getNumIncomingValues(); I < E; I++) {
+      Value *IncVal = Phi->getIncomingValue(I);
+      if (isa<ConstantAggregateZero>(IncVal)) {
+        Type *NewType = calculateConvertType(Phi->getType());
+        NewPhi->addIncoming(ConstantInt::get(NewType, 0, false),
+                            Phi->getIncomingBlock(I));
+      } else if (ValMap.contains(IncVal))
+        NewPhi->addIncoming(ValMap[IncVal], Phi->getIncomingBlock(I));
+      else
+        MissingIncVal = true;
+    }
+    if (!MissingIncVal)
+      DeadInstrs.insert(Phi);
+    else
+      DeadInstrs.insert(cast<Instruction>(ValMap[Phi]));
     Visited.insert(NewPhi);
   }
   // Coerce back to the original type and replace the uses.
@@ -379,10 +394,6 @@ bool LiveRegOptimizer::optimizeLiveType(Instruction *I) {
     }
   }
 
-  // Save the removed phis to be deleted later.
-  for (PHINode *Phi : PhiNodes) {
-    DeadInstrs.insert(Phi);
-  }
   return true;
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
index 53acbb6a7bceb..1e5ec361d154c 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
@@ -8,29 +8,30 @@ define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr
 ; CHECK:       ; %bb.0: ; %bb
 ; CHECK-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; CHECK-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x8
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_bitcmp0_b32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB0_2
 ; CHECK-NEXT:  ; %bb.1: ; %bb10
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
-; CHECK-NEXT:    global_load_dwordx2 v[0:1], v0, s[8:9]
+; CHECK-NEXT:    global_load_dwordx2 v[8:9], v0, s[8:9]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 8, v0
-; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
-; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
-; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 24, v1
+; CHECK-NEXT:    v_and_b32_e32 v7, 0xff, v8
+; CHECK-NEXT:    v_bfe_u32 v6, v8, 8, 8
+; CHECK-NEXT:    v_bfe_u32 v5, v8, 16, 8
+; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 24, v8
+; CHECK-NEXT:    v_and_b32_e32 v3, 0xff, v9
+; CHECK-NEXT:    v_bfe_u32 v2, v9, 8, 8
+; CHECK-NEXT:    v_bfe_u32 v1, v9, 16, 8
+; CHECK-NEXT:    v_lshrrev_b32_e32 v0, 24, v9
 ; CHECK-NEXT:    s_branch .LBB0_3
 ; CHECK-NEXT:  .LBB0_2:
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v4, 0
-; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v5, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v6, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v7, 0
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:  .LBB0_3: ; %bb41
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x48
 ; CHECK-NEXT:    v_mov_b32_e32 v8, s10
@@ -47,16 +48,16 @@ define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr
 ; CHECK-NEXT:    v_mov_b32_e32 v19, s21
 ; CHECK-NEXT:    v_mov_b32_e32 v20, s22
 ; CHECK-NEXT:    v_mov_b32_e32 v21, s23
-; CHECK-NEXT:    flat_store_byte v[8:9], v0
-; CHECK-NEXT:    flat_store_byte v[10:11], v7
-; CHECK-NEXT:    flat_store_byte v[12:13], v6
-; CHECK-NEXT:    flat_store_byte v[14:15], v5
-; CHECK-NEXT:    flat_store_byte v[16:17], v1
-; CHECK-NEXT:    flat_store_byte v[18:19], v4
-; CHECK-NEXT:    flat_store_byte v[20:21], v3
+; CHECK-NEXT:    flat_store_byte v[8:9], v7
+; CHECK-NEXT:    flat_store_byte v[10:11], v6
+; CHECK-NEXT:    flat_store_byte v[12:13], v5
+; CHECK-NEXT:    flat_store_byte v[14:15], v4
+; CHECK-NEXT:    flat_store_byte v[16:17], v3
+; CHECK-NEXT:    flat_store_byte v[18:19], v2
+; CHECK-NEXT:    flat_store_byte v[20:21], v1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
+; CHECK-NEXT:    flat_store_byte v[2:3], v0
 ; CHECK-NEXT:    s_endpgm
 bb:
   br i1 %arg, label %bb10, label %bb41
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll
index 0a27767877497..36a93bd2511ce 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll
@@ -1,26 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
 
-; GCN-LABEL: extract_2xi16
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: v_bfe_i32
-; GCN: v_bfe_i32
-
 define <2 x i16> @extract_2xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
 ; GCN-LABEL: extract_2xi16:
 ; GCN:       ; %bb.0:
@@ -115,8 +95,6 @@ exit:
   ret <2 x i16> %r2
 }
 
-; GCN-LABEL: extract_2xi64
-; GCN-COUNT-2: v_cndmask_b32
 define <2 x i64> @extract_2xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
 ; GCN-LABEL: extract_2xi64:
 ; GCN:       ; %bb.0:
@@ -188,8 +166,6 @@ exit:
   ret <2 x i64> %r2
 }
 
-; GCN-LABEL: extract_4xi64
-; GCN-COUNT-4: v_cndmask_b32
 define <4 x i64> @extract_4xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
 ; GCN-LABEL: extract_4xi64:
 ; GCN:       ; %bb.0:
@@ -267,8 +243,6 @@ exit:
   ret <4 x i64> %r2
 }
 
-; GCN-LABEL: extract_8xi64
-; GCN-COUNT-8: v_cndmask_b32
 define <8 x i64> @extract_8xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
 ; GCN-LABEL: extract_8xi64:
 ; GCN:       ; %bb.0:
@@ -373,8 +347,6 @@ exit:
   ret <8 x i64> %r2
 }
 
-; GCN-LABEL: extract_2xf64
-; GCN-COUNT-2: v_cndmask_b32
 define <2 x double> @extract_2xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
 ; GCN-LABEL: extract_2xf64:
 ; GCN:       ; %bb.0:
@@ -446,8 +418,6 @@ exit:
   ret <2 x double> %r2
 }
 
-; GCN-LABEL: extract_4xf64
-; GCN-COUNT-4: v_cndmask_b32
 define <4 x double> @extract_4xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
 ; GCN-LABEL: extract_4xf64:
 ; GCN:       ; %bb.0:
@@ -525,8 +495,6 @@ exit:
   ret <4 x double> %r2
 }
 
-; GCN-LABEL: extract_8xf64
-; GCN-COUNT-8: v_cndmask_b32
 define <8 x double> @extract_8xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
 ; GCN-LABEL: extract_8xf64:
 ; GCN:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index 6ee4df95308b9..f3cc9ba0a1c1b 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -456,6 +456,142 @@ bb.3:
   ret void
 }
 
+
+define amdgpu_kernel void @v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
+; GFX906-LABEL: v8i8_phi_zeroinit:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
+; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT:    ; implicit-def: $vgpr1_vgpr2
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    global_load_dwordx2 v[3:4], v5, s[0:1]
+; GFX906-NEXT:    v_cmp_lt_u32_e64 s[0:1], 14, v0
+; GFX906-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX906-NEXT:    s_cbranch_execz .LBB9_2
+; GFX906-NEXT:  ; %bb.1: ; %bb.1
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v5, s[2:3]
+; GFX906-NEXT:    s_mov_b32 s2, 0
+; GFX906-NEXT:    s_mov_b32 s3, s2
+; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_mov_b32_e32 v4, s3
+; GFX906-NEXT:    v_mov_b32_e32 v3, s2
+; GFX906-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX906-NEXT:    s_and_b64 s[2:3], vcc, exec
+; GFX906-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX906-NEXT:  .LBB9_2: ; %Flow
+; GFX906-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
+; GFX906-NEXT:    s_cbranch_execz .LBB9_4
+; GFX906-NEXT:  ; %bb.3: ; %bb.2
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_mov_b32_e32 v1, v3
+; GFX906-NEXT:    v_mov_b32_e32 v0, 0
+; GFX906-NEXT:    v_mov_b32_e32 v2, v4
+; GFX906-NEXT:    global_store_dwordx2 v0, v[3:4], s[4:5]
+; GFX906-NEXT:  .LBB9_4: ; %bb.3
+; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT:    v_mov_b32_e32 v0, 0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[6:7]
+; GFX906-NEXT:    s_endpgm
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
+  %cmp = icmp ult i32 %idx, 15
+  br i1 %cmp, label %bb.1, label %bb.2
+bb.1:
+  %cmp2 = icmp ult i32 %idx, 7
+  br i1 %cmp2, label %bb.2, label %bb.3
+
+bb.2:
+  %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ zeroinitializer, %bb.1 ]
+  store <8 x i8> %tmp5, ptr addrspace(1) %dst0, align 4
+  br label %bb.3
+
+bb.3:
+  %tmp7 = phi <8 x i8> [ %vec2, %bb.1], [%tmp5, %bb.2]
+  store <8 x i8> %tmp7, ptr addrspace(1) %dst1, align 4
+  ret void
+}
+
+define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
+; GFX906-LABEL: v8i8_phi_const:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v1, s[0:1]
+; GFX906-NEXT:    v_cmp_lt_u32_e64 s[0:1], 14, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
+; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX906-NEXT:    s_cbranch_execz .LBB10_2
+; GFX906-NEXT:  ; %bb.1: ; %bb.1
+; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v0
+; GFX906-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX906-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX906-NEXT:    v_mov_b32_e32 v1, 1
+; GFX906-NEXT:    v_mov_b32_e32 v8, 2
+; GFX906-NEXT:    v_mov_b32_e32 v6, 3
+; GFX906-NEXT:    v_mov_b32_e32 v7, 4
+; GFX906-NEXT:    v_mov_b32_e32 v2, 5
+; GFX906-NEXT:    v_mov_b32_e32 v5, 6
+; GFX906-NEXT:    v_mov_b32_e32 v3, 7
+; GFX906-NEXT:    v_mov_b32_e32 v4, 8
+; GFX906-NEXT:    s_or_b64 s[0:1], s[0:1], s[6:7]
+; GFX906-NEXT:  .LBB10_2: ; %Flow
+; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
+; GFX906-NEXT:    s_cbranch_execz .LBB10_4
+; GFX906-NEXT:  ; %bb.3: ; %bb.2
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v8
+; GFX906-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v7
+; GFX906-NEXT:    v_or_b32_sdwa v1, v6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v5
+; GFX906-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v4
+; GFX906-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_mov_b32_e32 v9, 0
+; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx2 v9, v[0:1], s[4:5]
+; GFX906-NEXT:  .LBB10_4: ; %bb.3
+; GFX906-NEXT:    s_endpgm
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
+  %cmp = icmp ult i32 %idx, 15
+  br i1 %cmp, label %bb.1, label %bb.2
+bb.1:
+  %cmp2 = icmp ult i32 %idx, 7
+  br i1 %cmp2, label %bb.2, label %bb.3
+
+bb.2:
+  %tmp5 = phi <8 x i8> [ %vec1, %entry ], [<i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, %bb.1 ]
+  store <8 x i8> %tmp5, ptr addrspace(1) %dst0, align 4
+  br label %bb.3
+
+bb.3:
+  %tmp7 = phi <8 x i8> [ %vec2, %bb.1], [%tmp5, %bb.2]
+  store <8 x i8> %tmp7, ptr addrspace(1) %dst1, align 4
+  ret void
+}
+
 define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
 ; GFX906-LABEL: v8i8_multi_block:
 ; GFX906:       ; %bb.0: ; %entry
@@ -469,18 +605,18 @@ define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspac
 ; GFX906-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX906-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB9_4
+; GFX906-NEXT:    s_cbranch_execz .LBB11_4
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
 ; GFX906-NEXT:    global_load_dwordx2 v[1:2], v6, s[2:3]
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v0
 ; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB9_3
+; GFX906-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX906-NEXT:  ; %bb.2: ; %bb.2
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    global_store_dwordx2 v0, v[3:4], s[4:5]
-; GFX906-NEXT:  .LBB9_3: ; %Flow
+; GFX906-NEXT:  .LBB11_3: ; %Flow
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX906-NEXT:  .LBB9_4: ; %bb.3
+; GFX906-NEXT:  .LBB11_4: ; %bb.3
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    global_store_dwordx2 v5, v[1:2], s[6:7]
@@ -519,13 +655,13 @@ define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrsp
 ; GFX906-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_mov_b32_e32 v0, v1
-; GFX906-NEXT:  .LBB10_1: ; %bb.1
+; GFX906-NEXT:  .LBB12_1: ; %bb.1
 ; GFX906-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX906-NEXT:    s_and_b64 s[6:7], exec, vcc
 ; GFX906-NEXT:    s_or_b64 s[2:3], s[6:7], s[2:3]
 ; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX906-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX906-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX906-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX906-NEXT:  ; %bb.2: ; %bb.2.loopexit
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34

>From c1903d00c38776df156aa6f4f092e9b3e14eb36b Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 18 Jun 2024 13:38:56 -0700
Subject: [PATCH 16/17] Track ValueToValueMap per block for users + fix typo

Change-Id: Ibd63f756e7560df13516efa6d5cb7a6a7b090f3d
---
 .../AMDGPU/AMDGPULateCodeGenPrepare.cpp       |  15 +-
 .../test/CodeGen/AMDGPU/vni8-across-blocks.ll | 143 ++++++++++++++++++
 2 files changed, 152 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 8693d733ad6a7..b918f5466a879 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -96,6 +96,8 @@ class LiveRegOptimizer {
   SmallPtrSet<Instruction *, 4> DeadInstrs;
   /// Map of Value -> Converted Value
   ValueToValueMap ValMap;
+  /// Map containing conversions from Optimal Type -> Original Type per BB.
+  DenseMap<BasicBlock *, ValueToValueMap> BBUseValMap;
 
 public:
   /// Calculate the and \p return  the type to convert to given a problematic \p
@@ -130,8 +132,8 @@ class LiveRegOptimizer {
     Type *EltTy = VTy->getElementType();
     // If the element size is not less than the convert to scalar size, then we
     // can't do any bit packing
-    if (!EltTy->isIntegerTy() || EltTy->getScalarSizeInBits() >
-                                     -(ConvertToScalar->getScalarSizeInBits()))
+    if (!EltTy->isIntegerTy() ||
+        EltTy->getScalarSizeInBits() > (ConvertToScalar->getScalarSizeInBits()))
       return false;
 
     // Only coerce illegal types
@@ -379,14 +381,15 @@ bool LiveRegOptimizer::optimizeLiveType(Instruction *I) {
     for (auto [OpIdx, Op] : enumerate(U->operands())) {
       if (ValMap.contains(Op)) {
         Value *NewVal = nullptr;
-        if (ValMap.contains(ValMap[Op]))
-          NewVal = ValMap[ValMap[Op]];
+        if (BBUseValMap.contains(U->getParent()) &&
+            BBUseValMap[U->getParent()].contains(ValMap[Op]))
+          NewVal = BBUseValMap[U->getParent()][ValMap[Op]];
         else {
-          BasicBlock::iterator InsertPt = U->getIterator();
+          BasicBlock::iterator InsertPt = U->getParent()->getFirstNonPHIIt();
           NewVal =
               convertFromOptType(Op->getType(), cast<Instruction>(ValMap[Op]),
                                  InsertPt, U->getParent());
-          ValMap[ValMap[Op]] = NewVal;
+          BBUseValMap[U->getParent()][ValMap[Op]] = NewVal;
         }
         assert(NewVal);
         U->setOperand(OpIdx, NewVal);
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index f3cc9ba0a1c1b..8b6bbae890b47 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -687,5 +687,148 @@ bb.2:
   ret void
 }
 
+; Should not have instances of "Instruction does not dominate all uses!"
+
+define amdgpu_kernel void @v8i8_multiuse_multiblock(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst1, ptr addrspace(1) nocapture %dst2, ptr addrspace(1) nocapture %dst3) {
+; GFX906-LABEL: v8i8_multiuse_multiblock:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX906-NEXT:    v_cmp_lt_u32_e64 s[2:3], 14, v0
+; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v1, s[4:5]
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX906-NEXT:    v_lshrrev_b16_e32 v3, 8, v1
+; GFX906-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX906-NEXT:    s_cbranch_execz .LBB13_2
+; GFX906-NEXT:  ; %bb.1: ; %bb.1
+; GFX906-NEXT:    v_lshlrev_b16_e32 v6, 8, v2
+; GFX906-NEXT:    s_mov_b32 s6, 0x6070504
+; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v0
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xffffff00, v1
+; GFX906-NEXT:    v_or_b32_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_and_b32_e32 v7, 0xffffff00, v2
+; GFX906-NEXT:    v_perm_b32 v8, v1, v1, s6
+; GFX906-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
+; GFX906-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX906-NEXT:    v_mov_b32_e32 v4, 0
+; GFX906-NEXT:    v_or_b32_sdwa v5, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v7, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX906-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX906-NEXT:    s_or_b64 s[2:3], s[2:3], s[6:7]
+; GFX906-NEXT:    v_or_b32_e32 v7, v6, v7
+; GFX906-NEXT:    v_or_b32_e32 v5, v6, v5
+; GFX906-NEXT:    global_store_dword v4, v1, s[8:9]
+; GFX906-NEXT:    global_store_dword v4, v8, s[8:9] offset:8
+; GFX906-NEXT:    global_store_dword v4, v7, s[8:9] offset:16
+; GFX906-NEXT:    global_store_dword v4, v5, s[8:9] offset:24
+; GFX906-NEXT:  .LBB13_2: ; %Flow
+; GFX906-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX906-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
+; GFX906-NEXT:    s_cbranch_execz .LBB13_4
+; GFX906-NEXT:  ; %bb.3: ; %bb.2
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xffffff00, v2
+; GFX906-NEXT:    v_lshlrev_b16_e32 v4, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_and_b32_e32 v6, 0xffffff00, v1
+; GFX906-NEXT:    s_mov_b32 s2, 0xc0c0001
+; GFX906-NEXT:    v_or_b32_sdwa v4, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX906-NEXT:    v_or_b32_sdwa v6, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_perm_b32 v9, 0, v2, s2
+; GFX906-NEXT:    v_mov_b32_e32 v0, 0
+; GFX906-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX906-NEXT:    v_perm_b32 v7, 0, v1, s2
+; GFX906-NEXT:    s_mov_b32 s3, 0xffff0000
+; GFX906-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 16, v9
+; GFX906-NEXT:    v_and_or_b32 v8, v1, s3, v7
+; GFX906-NEXT:    v_or_b32_e32 v6, v7, v6
+; GFX906-NEXT:    global_store_dword v0, v4, s[10:11]
+; GFX906-NEXT:    global_store_dword v0, v5, s[10:11] offset:8
+; GFX906-NEXT:    global_store_dword v0, v8, s[10:11] offset:16
+; GFX906-NEXT:    global_store_dword v0, v6, s[10:11] offset:24
+; GFX906-NEXT:  .LBB13_4: ; %bb.3
+; GFX906-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX906-NEXT:    v_lshlrev_b16_e32 v7, 8, v2
+; GFX906-NEXT:    v_and_b32_e32 v4, 0xff, v1
+; GFX906-NEXT:    v_and_b32_e32 v5, 0xffffff00, v2
+; GFX906-NEXT:    v_or_b32_e32 v8, v3, v7
+; GFX906-NEXT:    v_or_b32_sdwa v6, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX906-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX906-NEXT:    v_lshlrev_b16_e32 v9, 8, v1
+; GFX906-NEXT:    v_or_b32_e32 v3, v3, v5
+; GFX906-NEXT:    v_mov_b32_e32 v0, 0
+; GFX906-NEXT:    v_or_b32_e32 v6, v8, v6
+; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX906-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX906-NEXT:    v_or_b32_e32 v1, v8, v1
+; GFX906-NEXT:    v_or_b32_e32 v4, v3, v4
+; GFX906-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX906-NEXT:    global_store_dword v0, v6, s[0:1]
+; GFX906-NEXT:    global_store_dword v0, v1, s[0:1] offset:8
+; GFX906-NEXT:    global_store_dword v0, v4, s[0:1] offset:16
+; GFX906-NEXT:    global_store_dword v0, v2, s[0:1] offset:24
+; GFX906-NEXT:    s_endpgm
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
+  %cmp = icmp ult i32 %idx, 15
+  br i1 %cmp, label %bb.1, label %bb.2
+bb.1:
+  %s1 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s2 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
+  %s3 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  %s4 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 2, i32 3, i32 1>
+  %gep4 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst1, i32 0
+  %gep5 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst1, i32 1
+  %gep6 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst1, i32 2
+  %gep7 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst1, i32 3
+  store <4 x i8> %s1, ptr addrspace(1) %gep4, align 4
+  store <4 x i8> %s2, ptr addrspace(1) %gep5, align 4
+  store <4 x i8> %s3, ptr addrspace(1) %gep6, align 4
+  store <4 x i8> %s4, ptr addrspace(1) %gep7, align 4
+  %cmp2 = icmp ult i32 %idx, 7
+  br i1 %cmp2, label %bb.2, label %bb.3
+
+bb.2:
+  %s5 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 3, i32 1, i32 2>
+  %s6 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
+  %s7 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
+  %s8 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  %gep8 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst2, i32 0
+  %gep9 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst2, i32 1
+  %gep10 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst2, i32 2
+  %gep11 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst2, i32 3
+  store <4 x i8> %s5, ptr addrspace(1) %gep8, align 4
+  store <4 x i8> %s6, ptr addrspace(1) %gep9, align 4
+  store <4 x i8> %s7, ptr addrspace(1) %gep10, align 4
+  store <4 x i8> %s8, ptr addrspace(1) %gep11, align 4
+  br label %bb.3
+
+bb.3:
+  %s9 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
+  %s10 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+  %s11 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
+  %s12 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+  %gep12 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst3, i32 0
+  %gep13 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst3, i32 1
+  %gep14 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst3, i32 2
+  %gep15 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst3, i32 3
+  store <4 x i8> %s9, ptr addrspace(1) %gep12, align 4
+  store <4 x i8> %s10, ptr addrspace(1) %gep13, align 4
+  store <4 x i8> %s11, ptr addrspace(1) %gep14, align 4
+  store <4 x i8> %s12, ptr addrspace(1) %gep15, align 4
+  ret void
+}
+
 
 declare i32 @llvm.amdgcn.workitem.id.x()

>From e55ce0f795b166bb4f23410660f0e11e9654409f Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 18 Jun 2024 13:38:56 -0700
Subject: [PATCH 17/17] Review comments

Change-Id: Ibd63f756e7560df13516efa6d5cb7a6a7b090f3d
---
 .../AMDGPU/AMDGPULateCodeGenPrepare.cpp       |  37 +-
 .../test/CodeGen/AMDGPU/vni8-across-blocks.ll |   8 +-
 llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll | 352 ++++++++++++++++++
 3 files changed, 371 insertions(+), 26 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index b918f5466a879..7623b73d6dd5f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -133,7 +133,7 @@ class LiveRegOptimizer {
     // If the element size is not less than the convert to scalar size, then we
     // can't do any bit packing
     if (!EltTy->isIntegerTy() ||
-        EltTy->getScalarSizeInBits() > (ConvertToScalar->getScalarSizeInBits()))
+        EltTy->getScalarSizeInBits() > ConvertToScalar->getScalarSizeInBits())
       return false;
 
     // Only coerce illegal types
@@ -220,25 +220,22 @@ Value *LiveRegOptimizer::convertToOptType(Instruction *V,
   // If there is a bitsize match, we can fit the old vector into a new vector of
   // desired type.
   if (OriginalSize == NewSize)
-    return cast<Instruction>(
-        Builder.CreateBitCast(V, NewTy, V->getName() + ".bc"));
+    return Builder.CreateBitCast(V, NewTy, V->getName() + ".bc");
 
   // If there is a bitsize mismatch, we must use a wider vector.
   assert(NewSize > OriginalSize);
   uint64_t ExpandedVecElementCount = NewSize / VTy->getScalarSizeInBits();
 
   SmallVector<int, 8> ShuffleMask;
-  for (unsigned I = 0; I < VTy->getElementCount().getFixedValue(); I++)
+  uint64_t OriginalElementCount = VTy->getElementCount().getFixedValue();
+  for (unsigned I = 0; I < OriginalElementCount; I++)
     ShuffleMask.push_back(I);
 
-  for (uint64_t I = VTy->getElementCount().getFixedValue();
-       I < ExpandedVecElementCount; I++)
-    ShuffleMask.push_back(VTy->getElementCount().getFixedValue());
+  for (uint64_t I = OriginalElementCount; I < ExpandedVecElementCount; I++)
+    ShuffleMask.push_back(OriginalElementCount);
 
-  Instruction *ExpandedVec =
-      cast<Instruction>(Builder.CreateShuffleVector(V, ShuffleMask));
-  return cast<Instruction>(
-      Builder.CreateBitCast(ExpandedVec, NewTy, V->getName() + ".bc"));
+  Value *ExpandedVec = Builder.CreateShuffleVector(V, ShuffleMask);
+  return Builder.CreateBitCast(ExpandedVec, NewTy, V->getName() + ".bc");
 }
 
 Value *LiveRegOptimizer::convertFromOptType(Type *ConvertType, Instruction *V,
@@ -252,8 +249,7 @@ Value *LiveRegOptimizer::convertFromOptType(Type *ConvertType, Instruction *V,
   IRBuilder<> Builder(InsertBB, InsertPt);
   // If there is a bitsize match, we simply convert back to the original type.
   if (OriginalSize == NewSize)
-    return cast<Instruction>(
-        Builder.CreateBitCast(V, NewVTy, V->getName() + ".bc"));
+    return Builder.CreateBitCast(V, NewVTy, V->getName() + ".bc");
 
   // If there is a bitsize mismatch, then we must have used a wider value to
   // hold the bits.
@@ -277,7 +273,7 @@ Value *LiveRegOptimizer::convertFromOptType(Type *ConvertType, Instruction *V,
   SmallVector<int, 8> ShuffleMask(NarrowElementCount);
   std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);
 
-  return cast<Instruction>(Builder.CreateShuffleVector(Converted, ShuffleMask));
+  return Builder.CreateShuffleVector(Converted, ShuffleMask);
 }
 
 bool LiveRegOptimizer::optimizeLiveType(Instruction *I) {
@@ -349,15 +345,16 @@ bool LiveRegOptimizer::optimizeLiveType(Instruction *I) {
   }
 
   // Construct new-typed PHI nodes.
-  for (PHINode *Phi : PhiNodes)
+  for (PHINode *Phi : PhiNodes) {
     ValMap[Phi] = PHINode::Create(calculateConvertType(Phi->getType()),
                                   Phi->getNumIncomingValues(),
                                   Phi->getName() + ".tc", Phi->getIterator());
+  }
 
   // Connect all the PHI nodes with their new incoming values.
   for (PHINode *Phi : PhiNodes) {
     PHINode *NewPhi = cast<PHINode>(ValMap[Phi]);
-    bool MisingIncVal = false;
+    bool MissingIncVal = false;
     for (int I = 0, E = Phi->getNumIncomingValues(); I < E; I++) {
       Value *IncVal = Phi->getIncomingValue(I);
       if (isa<ConstantAggregateZero>(IncVal)) {
@@ -367,13 +364,9 @@ bool LiveRegOptimizer::optimizeLiveType(Instruction *I) {
       } else if (ValMap.contains(IncVal))
         NewPhi->addIncoming(ValMap[IncVal], Phi->getIncomingBlock(I));
       else
-        MisingIncVal = true;
+        MissingIncVal = true;
     }
-    if (!MisingIncVal)
-      DeadInstrs.insert(Phi);
-    else
-      DeadInstrs.insert(cast<Instruction>(ValMap[Phi]));
-    Visited.insert(NewPhi);
+    DeadInstrs.insert(MissingIncVal ? cast<Instruction>(ValMap[Phi]) : Phi);
   }
   // Coerce back to the original type and replace the uses.
   for (Instruction *U : Uses) {
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index 8b6bbae890b47..0ac7858e29089 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
 
 define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
@@ -385,9 +385,9 @@ entry:
   %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
   %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
   switch i32 %in, label %return [
-    i32 1, label %return.sink.split
-    i32 2, label %return.sink.split
-    i32 3, label %sw.bb5
+  i32 1, label %return.sink.split
+  i32 2, label %return.sink.split
+  i32 3, label %sw.bb5
   ]
 
 sw.bb5:
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll b/llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll
new file mode 100644
index 0000000000000..5d2e299aa854a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll
@@ -0,0 +1,352 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -mtriple=amdgcn -mcpu=gfx906 -amdgpu-late-codegenprepare -S -o - %s | FileCheck --check-prefix=GFX906 %s
+
+define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: define amdgpu_kernel void @v3i8_liveout(
+; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX906-NEXT:  entry:
+; GFX906-NEXT:    [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX906-NEXT:    [[GEP1:%.*]] = getelementptr <3 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
+; GFX906-NEXT:    [[VEC1:%.*]] = load <3 x i8>, ptr addrspace(1) [[GEP1]], align 4
+; GFX906-NEXT:    [[TMP0:%.*]] = shufflevector <3 x i8> [[VEC1]], <3 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX906-NEXT:    [[VEC1_BC:%.*]] = bitcast <4 x i8> [[TMP0]] to i32
+; GFX906-NEXT:    [[GEP2:%.*]] = getelementptr <3 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
+; GFX906-NEXT:    [[VEC2:%.*]] = load <3 x i8>, ptr addrspace(1) [[GEP2]], align 4
+; GFX906-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i8> [[VEC2]], <3 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX906-NEXT:    [[VEC2_BC:%.*]] = bitcast <4 x i8> [[TMP1]] to i32
+; GFX906-NEXT:    [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
+; GFX906-NEXT:    br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]]
+; GFX906:       bb.1:
+; GFX906-NEXT:    br label [[BB_2]]
+; GFX906:       bb.2:
+; GFX906-NEXT:    [[TMP5_TC:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP5_TC]] to i24
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast i24 [[TMP2]] to <3 x i8>
+; GFX906-NEXT:    store <3 x i8> [[TMP3]], ptr addrspace(1) [[DST]], align 4
+; GFX906-NEXT:    ret void
+;
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <3 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <3 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <3 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <3 x i8>, ptr addrspace(1) %gep2
+  %cmp = icmp ult i32 %idx, 15
+  br i1 %cmp, label %bb.1, label %bb.2
+bb.1:
+  br label %bb.2
+
+bb.2:
+  %tmp5 = phi <3 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+  store <3 x i8> %tmp5, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: define amdgpu_kernel void @v4i8_liveout(
+; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:  entry:
+; GFX906-NEXT:    [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX906-NEXT:    [[GEP1:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
+; GFX906-NEXT:    [[VEC1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 4
+; GFX906-NEXT:    [[VEC1_BC:%.*]] = bitcast <4 x i8> [[VEC1]] to i32
+; GFX906-NEXT:    [[GEP2:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
+; GFX906-NEXT:    [[VEC2:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP2]], align 4
+; GFX906-NEXT:    [[VEC2_BC:%.*]] = bitcast <4 x i8> [[VEC2]] to i32
+; GFX906-NEXT:    [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
+; GFX906-NEXT:    br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]]
+; GFX906:       bb.1:
+; GFX906-NEXT:    br label [[BB_2]]
+; GFX906:       bb.2:
+; GFX906-NEXT:    [[TMP5_TC:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
+; GFX906-NEXT:    [[TMP5_TC_BC:%.*]] = bitcast i32 [[TMP5_TC]] to <4 x i8>
+; GFX906-NEXT:    store <4 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST]], align 4
+; GFX906-NEXT:    ret void
+;
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+  %cmp = icmp ult i32 %idx, 15
+  br i1 %cmp, label %bb.1, label %bb.2
+bb.1:
+  br label %bb.2
+
+bb.2:
+  %tmp5 = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+  store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: define amdgpu_kernel void @v5i8_liveout(
+; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:  entry:
+; GFX906-NEXT:    [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX906-NEXT:    [[GEP1:%.*]] = getelementptr <5 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
+; GFX906-NEXT:    [[VEC1:%.*]] = load <5 x i8>, ptr addrspace(1) [[GEP1]], align 8
+; GFX906-NEXT:    [[TMP0:%.*]] = shufflevector <5 x i8> [[VEC1]], <5 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 5, i32 5>
+; GFX906-NEXT:    [[VEC1_BC:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+; GFX906-NEXT:    [[GEP2:%.*]] = getelementptr <5 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
+; GFX906-NEXT:    [[VEC2:%.*]] = load <5 x i8>, ptr addrspace(1) [[GEP2]], align 8
+; GFX906-NEXT:    [[TMP1:%.*]] = shufflevector <5 x i8> [[VEC2]], <5 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 5, i32 5>
+; GFX906-NEXT:    [[VEC2_BC:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+; GFX906-NEXT:    [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
+; GFX906-NEXT:    br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]]
+; GFX906:       bb.1:
+; GFX906-NEXT:    br label [[BB_2]]
+; GFX906:       bb.2:
+; GFX906-NEXT:    [[TMP5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[TMP5_TC]] to <8 x i8>
+; GFX906-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4>
+; GFX906-NEXT:    store <5 x i8> [[TMP3]], ptr addrspace(1) [[DST]], align 4
+; GFX906-NEXT:    ret void
+;
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <5 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <5 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <5 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <5 x i8>, ptr addrspace(1) %gep2
+  %cmp = icmp ult i32 %idx, 15
+  br i1 %cmp, label %bb.1, label %bb.2
+bb.1:
+  br label %bb.2
+
+bb.2:
+  %tmp5 = phi <5 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+  store <5 x i8> %tmp5, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: define amdgpu_kernel void @v8i8_liveout(
+; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:  entry:
+; GFX906-NEXT:    [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX906-NEXT:    [[GEP1:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
+; GFX906-NEXT:    [[VEC1:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP1]], align 8
+; GFX906-NEXT:    [[VEC1_BC:%.*]] = bitcast <8 x i8> [[VEC1]] to <2 x i32>
+; GFX906-NEXT:    [[GEP2:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
+; GFX906-NEXT:    [[VEC2:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP2]], align 8
+; GFX906-NEXT:    [[VEC2_BC:%.*]] = bitcast <8 x i8> [[VEC2]] to <2 x i32>
+; GFX906-NEXT:    [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
+; GFX906-NEXT:    br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]]
+; GFX906:       bb.1:
+; GFX906-NEXT:    br label [[BB_2]]
+; GFX906:       bb.2:
+; GFX906-NEXT:    [[TMP5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
+; GFX906-NEXT:    [[TMP5_TC_BC:%.*]] = bitcast <2 x i32> [[TMP5_TC]] to <8 x i8>
+; GFX906-NEXT:    store <8 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST]], align 4
+; GFX906-NEXT:    ret void
+;
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
+  %cmp = icmp ult i32 %idx, 15
+  br i1 %cmp, label %bb.1, label %bb.2
+bb.1:
+  br label %bb.2
+
+bb.2:
+  %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+  store <8 x i8> %tmp5, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: define amdgpu_kernel void @repeat_successor(
+; GFX906-SAME: i32 [[IN:%.*]], ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:  entry:
+; GFX906-NEXT:    [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX906-NEXT:    [[GEP1:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
+; GFX906-NEXT:    [[VEC1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 4
+; GFX906-NEXT:    [[VEC1_BC:%.*]] = bitcast <4 x i8> [[VEC1]] to i32
+; GFX906-NEXT:    [[GEP2:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
+; GFX906-NEXT:    [[VEC2:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP2]], align 4
+; GFX906-NEXT:    [[VEC2_BC:%.*]] = bitcast <4 x i8> [[VEC2]] to i32
+; GFX906-NEXT:    switch i32 [[IN]], label [[RETURN:%.*]] [
+; GFX906-NEXT:      i32 1, label [[RETURN_SINK_SPLIT:%.*]]
+; GFX906-NEXT:      i32 2, label [[RETURN_SINK_SPLIT]]
+; GFX906-NEXT:      i32 3, label [[SW_BB5:%.*]]
+; GFX906-NEXT:    ]
+; GFX906:       sw.bb5:
+; GFX906-NEXT:    br label [[RETURN_SINK_SPLIT]]
+; GFX906:       return.sink.split:
+; GFX906-NEXT:    [[TMP5_TC:%.*]] = phi i32 [ [[VEC2_BC]], [[SW_BB5]] ], [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC1_BC]], [[ENTRY]] ]
+; GFX906-NEXT:    [[TMP5_TC_BC:%.*]] = bitcast i32 [[TMP5_TC]] to <4 x i8>
+; GFX906-NEXT:    store <4 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST]], align 4
+; GFX906-NEXT:    ret void
+; GFX906:       return:
+; GFX906-NEXT:    ret void
+;
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+  switch i32 %in, label %return [
+  i32 1, label %return.sink.split
+  i32 2, label %return.sink.split
+  i32 3, label %sw.bb5
+  ]
+
+sw.bb5:
+  br label %return.sink.split
+
+return.sink.split:
+  %tmp5 = phi <4 x i8> [ %vec2, %sw.bb5 ], [ %vec1, %entry ], [ %vec1, %entry ]
+  store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4
+  ret void
+
+return:
+  ret void
+}
+
+define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
+; GFX906-LABEL: define amdgpu_kernel void @v8i8_phi_chain(
+; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST0:%.*]], ptr addrspace(1) nocapture [[DST1:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:  entry:
+; GFX906-NEXT:    [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX906-NEXT:    [[GEP1:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
+; GFX906-NEXT:    [[VEC1:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP1]], align 8
+; GFX906-NEXT:    [[VEC1_BC:%.*]] = bitcast <8 x i8> [[VEC1]] to <2 x i32>
+; GFX906-NEXT:    [[GEP2:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
+; GFX906-NEXT:    [[VEC2:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP2]], align 8
+; GFX906-NEXT:    [[VEC2_BC:%.*]] = bitcast <8 x i8> [[VEC2]] to <2 x i32>
+; GFX906-NEXT:    [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
+; GFX906-NEXT:    br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]]
+; GFX906:       bb.1:
+; GFX906-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[IDX]], 7
+; GFX906-NEXT:    br i1 [[CMP2]], label [[BB_2]], label [[BB_3:%.*]]
+; GFX906:       bb.2:
+; GFX906-NEXT:    [[TMP5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
+; GFX906-NEXT:    [[TMP5_TC_BC:%.*]] = bitcast <2 x i32> [[TMP5_TC]] to <8 x i8>
+; GFX906-NEXT:    store <8 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST0]], align 4
+; GFX906-NEXT:    br label [[BB_3]]
+; GFX906:       bb.3:
+; GFX906-NEXT:    [[TMP7_TC:%.*]] = phi <2 x i32> [ [[VEC2_BC]], [[BB_1]] ], [ [[TMP5_TC]], [[BB_2]] ]
+; GFX906-NEXT:    [[TMP7_TC_BC:%.*]] = bitcast <2 x i32> [[TMP7_TC]] to <8 x i8>
+; GFX906-NEXT:    store <8 x i8> [[TMP7_TC_BC]], ptr addrspace(1) [[DST1]], align 4
+; GFX906-NEXT:    ret void
+;
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
+  %cmp = icmp ult i32 %idx, 15
+  br i1 %cmp, label %bb.1, label %bb.2
+bb.1:
+  %cmp2 = icmp ult i32 %idx, 7
+  br i1 %cmp2, label %bb.2, label %bb.3
+
+bb.2:
+  %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+  store <8 x i8> %tmp5, ptr addrspace(1) %dst0, align 4
+  br label %bb.3
+
+bb.3:
+  %tmp7 = phi <8 x i8> [ %vec2, %bb.1], [%tmp5, %bb.2]
+  store <8 x i8> %tmp7, ptr addrspace(1) %dst1, align 4
+  ret void
+}
+
+define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
+; GFX906-LABEL: define amdgpu_kernel void @v8i8_multi_block(
+; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST0:%.*]], ptr addrspace(1) nocapture [[DST1:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:  entry:
+; GFX906-NEXT:    [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX906-NEXT:    [[GEP1:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
+; GFX906-NEXT:    [[VEC1:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP1]], align 8
+; GFX906-NEXT:    [[VEC1_BC:%.*]] = bitcast <8 x i8> [[VEC1]] to <2 x i32>
+; GFX906-NEXT:    [[GEP2:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
+; GFX906-NEXT:    [[VEC2:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP2]], align 8
+; GFX906-NEXT:    [[VEC2_BC:%.*]] = bitcast <8 x i8> [[VEC2]] to <2 x i32>
+; GFX906-NEXT:    [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
+; GFX906-NEXT:    br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_3:%.*]]
+; GFX906:       bb.1:
+; GFX906-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[IDX]], 7
+; GFX906-NEXT:    br i1 [[CMP2]], label [[BB_2:%.*]], label [[BB_3]]
+; GFX906:       bb.2:
+; GFX906-NEXT:    [[VEC1_BC_BC:%.*]] = bitcast <2 x i32> [[VEC1_BC]] to <8 x i8>
+; GFX906-NEXT:    store <8 x i8> [[VEC1_BC_BC]], ptr addrspace(1) [[DST0]], align 4
+; GFX906-NEXT:    br label [[BB_3]]
+; GFX906:       bb.3:
+; GFX906-NEXT:    [[TMP5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ], [ [[VEC2_BC]], [[BB_2]] ]
+; GFX906-NEXT:    [[TMP5_TC_BC:%.*]] = bitcast <2 x i32> [[TMP5_TC]] to <8 x i8>
+; GFX906-NEXT:    store <8 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST1]], align 4
+; GFX906-NEXT:    ret void
+;
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
+  %cmp = icmp ult i32 %idx, 15
+  br i1 %cmp, label %bb.1, label %bb.3
+bb.1:
+  %cmp2 = icmp ult i32 %idx, 7
+  br i1 %cmp2, label %bb.2, label %bb.3
+
+bb.2:
+  store <8 x i8> %vec1, ptr addrspace(1) %dst0, align 4
+  br label %bb.3
+
+bb.3:
+  %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ], [ %vec2, %bb.2]
+  store <8 x i8> %tmp5, ptr addrspace(1) %dst1, align 4
+  ret void
+}
+
+define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: define amdgpu_kernel void @v32i8_loop_carried(
+; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:  entry:
+; GFX906-NEXT:    [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX906-NEXT:    [[GEP1:%.*]] = getelementptr <32 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
+; GFX906-NEXT:    [[VEC1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 4
+; GFX906-NEXT:    [[VEC1_BC:%.*]] = bitcast <4 x i8> [[VEC1]] to i32
+; GFX906-NEXT:    br label [[BB_1:%.*]]
+; GFX906:       bb.1:
+; GFX906-NEXT:    [[TEMP_TC:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC:%.*]], [[BB_1]] ]
+; GFX906-NEXT:    [[TEMP_TC_BC:%.*]] = bitcast i32 [[TEMP_TC]] to <4 x i8>
+; GFX906-NEXT:    [[VEC1_BC_BC:%.*]] = bitcast i32 [[VEC1_BC]] to <4 x i8>
+; GFX906-NEXT:    [[VEC2:%.*]] = shufflevector <4 x i8> [[VEC1_BC_BC]], <4 x i8> [[TEMP_TC_BC]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; GFX906-NEXT:    [[VEC2_BC]] = bitcast <4 x i8> [[VEC2]] to i32
+; GFX906-NEXT:    [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
+; GFX906-NEXT:    br i1 [[CMP]], label [[BB_1]], label [[BB_2:%.*]]
+; GFX906:       0:
+; GFX906-NEXT:    br label [[BB_2]]
+; GFX906:       bb.2:
+; GFX906-NEXT:    [[VEC2_BC_BC:%.*]] = bitcast i32 [[VEC2_BC]] to <4 x i8>
+; GFX906-NEXT:    store <4 x i8> [[VEC2_BC_BC]], ptr addrspace(1) [[DST]], align 4
+; GFX906-NEXT:    ret void
+;
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  br label %bb.1
+
+bb.1:
+  %temp = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+  %vec2 = shufflevector <4 x i8> %vec1, <4 x i8> %temp, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %cmp = icmp ult i32 %idx, 15
+  br i1 %cmp, label %bb.1, label %bb.2
+  br label %bb.2
+
+bb.2:
+  store <4 x i8> %vec2, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()



More information about the llvm-commits mailing list